Summary of DF as matrix
我已经搜索了很长时间,但找不到真正的简单方法。
我有一个仅包含数字值的df,我想从我的df中创建一个摘要矩阵。
1 2 3 4 5 6 | DF V1 V2 V3 V4 V5 ... x1 y1 z1 1 c1 x2 NA z2 0 c2 x3 y3 z3 1 NA ... |
V4最初是一个TRUE / FALSE变量,已转换为通常应该起作用的数字变量。
我想获得以下内容:
1 2 3 4 5 6 7 | N Mean SD Min 1st Median 3rd Max V1 V2 V3 V4 V5 ... |
具有相应的N,Mean,SD,Min,1st,Median,3rd,Max。
我已经尝试过简单
我尝试了 stargazer,但由于某种原因它无法工作(我猜是因为我有二进制变量)
1 2 | stargazer(DF, type="html", nobs = TRUE, type="html", mean.sd = TRUE, median = TRUE, iqr = TRUE, + digits=2, align=T) |
此外,我还读到一些有关qwraps2_summary_table的信息。但是它们给出的表格样式似乎都与我想要的不同。
我知道我也可以像这样运行循环:
1 2 3 4 | for(i in (1:length(DF)){ sum$N<-(????) sum$Mean<-mean(DF[i]) ....} |
但这不是最好的解决方案。
有什么建议吗?谢谢!
这是我的数据集的一部分
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | structure(list(Year = c(2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2018, 2011), Occurences = c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 2L, 9L), Balance = c(-1.14, 1.05, -1.06, 1.01, 1.01, 1.01, -1.09, -1, -1.04, -1.03), Withdrawal = c(43200, 41080, 43400, 43183, 42600, 42100, 45900, 46000, 3892008, 48374), Verification_SA = c(NA, NA, NA, NA, 1, 1, NA, 1, 1, NA), Classification_num = c(NA, NA, NA, NA, 3, 2, NA, 4, 4, NA), Interaction_Verification_Classification = c(NA, NA, NA, NA, 3, 2, NA, 4, 4, NA), KnowledgeSources = c(1, 1, 1, 0, 1, 1, 1, 1, 1, 0), KnowledgeDischarge = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 0), Scarcity_watershed = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_ ), Scarcity_country = c(NA, NA, NA, NA, NA, NA, NA, NA, 3.35, NA), Knowledge_Watershed = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Knowledge_Facilities = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 1), Importance_num = c(NA, NA, NA, 3, 3, 3, 3, 3, 5, NA), DetrimentalImpacts_num = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0), Responsibility_num = c(1, 1, 1, 2, 2, 2, 2, 3, 3, 1)), row.names = c(NA, -10L), class = c("tbl_df","tbl","data.frame" )) |
以伊恩·坎贝尔的答案为基础,在出现需要时不要害怕建立汇总函数。
#' Summarise one numeric column.
#'
#' All statistics are computed on the non-missing values only. The original
#' version called `min(x)` without `na.rm = TRUE`, so any column containing a
#' missing value reported `NA` as its minimum while every other statistic
#' ignored the NAs.
#'
#' @param x A numeric vector; missing values are allowed.
#' @return A numeric vector of length 8 holding, in order: minimum, first
#'   quartile, median, mean, standard deviation, third quartile, maximum and
#'   the number of missing values. If `x` has no non-missing values, the
#'   first seven entries are `NA` (instead of `Inf`/`-Inf`/`NaN` plus
#'   warnings).
summaryfn <- function(x) {
  n_missing <- sum(is.na(x))
  observed <- x[!is.na(x)]

  # Nothing to summarise: min()/max() on an empty vector would warn and
  # return Inf/-Inf, so report NA for every statistic instead.
  if (length(observed) == 0L) {
    return(c(rep(NA_real_, 7L), n_missing))
  }

  quartiles <- quantile(observed, c(0.25, 0.5, 0.75), names = FALSE)
  c(
    min(observed),
    quartiles[1],
    quartiles[2],
    mean(observed),
    sd(observed), # NA when only one value is observed
    quartiles[3],
    max(observed),
    n_missing
  )
}

# `df` is assumed to hold the data frame shown in the question.
# One row of statistics per column, stacked into a numeric matrix.
res <- do.call(rbind, lapply(df, summaryfn))
colnames(res) <- c("Min", "Q1", "Med", "Mean", "Sd", "Q3", "Max", "NAs")

## > res
##                                              Min        Q1       Med          Mean           Sd       Q3        Max NAs
## Year                                     2011.00  2012.250  2014.500   2014.500000 2.718251e+00  2016.75    2018.00   0
## Occurences                                  2.00     9.000     9.000      8.300000 2.213594e+00     9.00       9.00   0
## Balance                                    -1.14    -1.055    -1.015     -0.228000 1.074800e+00     1.01       1.05   0
## Withdrawal                              41080.00 42745.750 43300.000 428784.500000 1.216855e+06 45975.00 3892008.00   0
## Verification_SA                             1.00     1.000     1.000      1.000000 0.000000e+00     1.00       1.00   6
## Classification_num                          2.00     2.750     3.500      3.250000 9.574271e-01     4.00       4.00   6
## Interaction_Verification_Classification     2.00     2.750     3.500      3.250000 9.574271e-01     4.00       4.00   6
## KnowledgeSources                            0.00     1.000     1.000      0.800000 4.216370e-01     1.00       1.00   0
## KnowledgeDischarge                          0.00     0.000     0.000      0.400000 5.163978e-01     1.00       1.00   0
## Scarcity_watershed                            NA        NA        NA            NA           NA       NA         NA  10
## Scarcity_country                            3.35     3.350     3.350      3.350000           NA     3.35       3.35   9
## Knowledge_Watershed                         0.00     0.000     0.000      0.000000 0.000000e+00     0.00       0.00   0
## Knowledge_Facilities                        0.00     0.000     0.000      0.200000 4.216370e-01     0.00       1.00   0
## Importance_num                              3.00     3.000     3.000      3.333333 8.164966e-01     3.00       5.00   4
## DetrimentalImpacts_num                      0.00     0.000     0.000      0.100000 3.162278e-01     0.00       1.00   0
## Responsibility_num                          1.00     1.000     2.000      1.800000 7.888106e-01     2.00       3.00   0

## > str(res)
##  num [1:16, 1:8] 2011 2 -1.14 41080 1 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:16] "Year" "Occurences" "Balance" "Withdrawal" ...
##   ..$ : chr [1:8] "Min" "Q1" "Med" "Mean" ...
尽管在很多情况下
# Convert the summary matrix to an xtable object and emit it as HTML.
print(x = xtable(res), type = "html")
如果以后有人发现此问题,请根据@camille的建议进行检查。这是一种简单的方法,尽管它不提供
library(psych)

# Describe each column of DF separately with psych::describe(), additionally
# requesting the 25% and 75% quantiles, then stack the one-row results.
my_summary <- do.call(
  rbind,
  lapply(DF, function(column) psych::describe(column, quant = c(0.25, 0.75)))
)
my_summary
#                   vars n mean sd median trimmed mad min max range skew kurtosis se Q0.25 Q0.75
#Year               1 10 2014.50 2.72 2014.50 2014.50 3.71 2011.00 2018.00 7.00 0.00 -1.74 0.86 2012.25 2016.75
#Occurences         1 10 8.30 2.21 9.00 9.00 0.00 2.00 9.00 7.00 -2.28 3.57 0.70 9.00 9.00
#Balance            1 10 -0.23 1.07 -1.02 -0.27 0.15 -1.14 1.05 2.19 0.35 -2.05 0.34 -1.06 1.01
#Withdrawal         1 10 428784.50 1216854.64 43300.00 44344.62 2535.25 41080.00 3892008.00 3850928.00 2.28 3.57 384803.22 42745.75 45975.00
#Verification_SA    1 4 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 NaN NaN 0.00 1.00 1.00
#Classification_num 1 4 3.25 0.96 3.50 3.25 0.74 2.00 4.00 2.00 -0.32 -2.08 0.48 2.75 4.00
...
这是我最初的解决方案,使用
library(data.table)

# For every column of DF, build a one-row data frame holding the base
# summary() statistics plus the standard deviation (SD) and the number of
# non-missing values (N), then stack the rows with rbindlist().
# fill = TRUE pads statistics that are absent for some columns with NA (see
# the "NA's" column in the output below); idcol stores each column's name in
# "Variable".
my_summary <- rbindlist(
  lapply(DF, function(column) {
    stats <- c(
      summary(column),
      SD = sd(column, na.rm = TRUE),
      N = sum(!is.na(column))
    )
    as.data.frame(t(stats))
  }),
  fill = TRUE,
  use.names = TRUE,
  idcol = "Variable"
)
my_summary
#              Variable     Min.   1st Qu.    Median          Mean  3rd Qu.       Max.           SD  N NA's
# 1:               Year  2011.00  2012.250  2014.500   2014.500000  2016.75    2018.00 2.718251e+00 10   NA
# 2:         Occurences     2.00     9.000     9.000      8.300000     9.00       9.00 2.213594e+00 10   NA
# 3:            Balance    -1.14    -1.055    -1.015     -0.228000     1.01       1.05 1.074800e+00 10   NA
# 4:         Withdrawal 41080.00 42745.750 43300.000 428784.500000 45975.00 3892008.00 1.216855e+06 10   NA
# 5:    Verification_SA     1.00     1.000     1.000      1.000000     1.00       1.00 0.000000e+00  4    6
# 6: Classification_num     2.00     2.750     3.500      3.250000     4.00       4.00 9.574271e-01  4    6
我们可以使用
的单个data.frame输出
library(purrr)
library(dplyr)

# Build a one-row tibble per column of DF (standard deviation, count of
# non-missing values, and the base summary() statistics spread into columns)
# and row-bind them; .id records the source column name in "Variable".
map_dfr(
  DF,
  function(column) {
    tibble(
      SD = sd(column, na.rm = TRUE),
      N = sum(!is.na(column)),
      as.data.frame.list(base::summary(column))
    )
  },
  .id = "Variable"
)
# A tibble: 16 x 10
#         SD     N     Min. X1st.Qu. Median     Mean X3rd.Qu.   Max.  NA.s Variable
# *    <dbl> <int>    <dbl>    <dbl>  <dbl>    <dbl>    <dbl>  <dbl> <dbl> <chr>
# 1 2.72e+0     10  2011      2012.   2014.  2.01e+3    2017. 2.02e3    NA Year
# 2 2.21e+0     10     2         9       9   8.30e+0       9  9.00e0    NA Occurences
# 3 1.07e+0     10    -1.14     -1.06   -1.02 -2.28e-1     1.01 1.05e0  NA Balance
# 4 1.22e+6     10 41080     42746.  43300   4.29e+5   45975  3.89e6    NA Withdrawal
# 5 0.           4     1         1       1   1.00e+0       1  1.00e0     6 Verification_SA
# 6 9.57e-1      4     2         2.75    3.5 3.25e+0       4  4.00e0     6 Classification_num
# 7 9.57e-1      4     2         2.75    3.5 3.25e+0       4  4.00e0     6 Interaction_Verificatio…
# 8 4.22e-1     10     0         1       1   8.00e-1       1  1.00e0    NA KnowledgeSources
# 9 5.16e-1     10     0         0       0   4.00e-1       1  1.00e0    NA KnowledgeDischarge
#10 NA           0    NA        NA      NA   NaN          NA  NA        10 Scarcity_watershed
#11 NA           1     3.35      3.35    3.35 3.35e+0      3.35 3.35e0   9 Scarcity_country
#12 0.          10     0         0       0   0.            0  0.        NA Knowledge_Watershed
#13 4.22e-1     10     0         0       0   2.00e-1       0  1.00e0    NA Knowledge_Facilities
#14 8.16e-1      6     3         3       3   3.33e+0       3  5.00e0     4 Importance_num
#15 3.16e-1     10     0         0       0   1.00e-1       0  1.00e0    NA DetrimentalImpacts_num
#16 7.89e-1     10     1         1       2   1.80e+0       2  3.00e0    NA Responsibility_num