关于dplyr:R中按组划分、按月求和的汇总

aggregation by sum of month divided by groups in R

此处为mydata

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Sample data: ten return records for one shop/product pair during 2018.
# doc_date holds "dd.mm.yyyy" strings as a factor; the levels are listed
# explicitly so their order does not depend on the locale's collation.
mydata <- structure(
  list(
    doc_date = factor(
      c("24.01.2018", "28.02.2018", "14.03.2018", "28.03.2018", "06.04.2018",
        "20.04.2018", "25.05.2018", "01.06.2018", "08.07.2018", "21.09.2018"),
      levels = c("01.06.2018", "06.04.2018", "08.07.2018", "14.03.2018",
                 "20.04.2018", "21.09.2018", "24.01.2018", "25.05.2018",
                 "28.02.2018", "28.03.2018")
    ),
    shop_id = rep(67885L, 10L),
    shop_code = factor(rep("02293НСК", 10L)),
    product_id = rep(11622L, 10L),
    product_group_id = rep(5L, 10L),
    city_id = rep(9L, 10L),
    fin_centre_id = rep(15L, 10L),
    # Number of returned items recorded on each date.
    return_count = c(2L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 3L, 2L)
  ),
  class = "data.frame",
  row.names = c(NA, -10L)
)

如何按 shop_code 和 product_id 分组,对 return_count 列按月求和,
并以数据透视(宽)格式输出各月的总和?

即,期望的输出为:

1
2
  jan feb march apr may jun jul aug sept oct nov dec
1   2   3     2   2   3   1   3   0    2   0   0   0

该问题并不是下面这个主题的重复:
「每组汇总/聚合多个变量(例如,总和、均值)」,
因为我需要的是数据透视格式的结果。


问题在于同一个月份中存在多个不同的日期(日不同),因此我们先在月份级别进行汇总,然后再转换为数据透视(宽)格式。试试这个:

1
2
3
4
5
6
 mydata$new_date <- dmy(mydata$doc_date) # convert to date format)
 mydata$month <- month(mydata$new_date) # extract month from date

 mydata <- mydata %>% group_by(shop_code,product_id,month) %>% summarise(return_count= sum(return_count)) # group at your required level

 mydata_1 <- dcast(setDT(mydata), shop_code + product_id  ~ month , fun.aggregate = sum, value.var = c("return_count")) # Pivot up using dcast

这是一种data.table方法:

已编辑:结果中现在也包含计数为 0 的月份

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
library(data.table)
library(lubridate)
setDT(mydata)

# Label each row with its month. doc_date is "dd.mm.yyyy", so the year must be
# parsed with %Y (four digits); %y would consume only the first two digits.
# label = TRUE returns an ordered factor whose levels are all twelve months.
mydata[, month := lubridate::month(
  as.Date(doc_date, format = "%d.%m.%Y"),
  label = TRUE
)]

# Total return_count per month. Aggregating in j yields exactly one row per
# month, so no de-duplication step is needed afterwards.
# NOTE(review): this sums across all products; the sample data contains a single
# product_id / product_group_id, so the result is the same for it.
mydata <- mydata[, .(sum = sum(return_count)), by = month]

# Right-join against the full list of months so that months without any rows
# are present, then turn their missing totals into 0.
all_months <- data.table(month = lubridate::month(1:12, label = TRUE))
mydata <- merge(mydata, all_months, by = "month", all.y = TRUE)
mydata[is.na(sum), sum := 0L]

## output
 month sum
 1:   Jan   2
 2:   Feb   3
 3:   Mar   2
 4:   Apr   2
 5:   May   3
 6:   Jun   1
 7:   Jul   3
 8:   Aug   0
 9:   Sep   2
10:   Oct   0
11:   Nov   0
12:   Dec   0

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
> # Sum return_count per calendar month using dplyr.
> library(tidyverse)
> library(lubridate)  # provides dmy(); older tidyverse releases do not attach it
> # months() returns month names in the session locale, which is why the
> # names printed below are German; rows are sorted by name, not by calendar.
> mydata$months <- months(dmy(mydata$doc_date))
> my <- mydata %>%
+   group_by(months) %>%
+   summarise(re_count = sum(return_count, na.rm = TRUE))
> my

# A tibble: 8 x 2
  months    re_count
  <chr>        <int>
1 April            2
2 Februar          3
3 Januar           2
4 Juli             3
5 Juni             1
6 Mai              3
7 März             2
8 September        2
>

以上是我使用 tidyverse 方法给出的解决方案。(抱歉,不知为何我这里的月份名称显示为德语。)