使用R按可变分箱宽度和因子统计频数

使用R - 按可变分箱宽度和因子统计频数(Using R - frequency counts with variable bin widths and factors)

我有一个相当大的数据集（超过100万行），其中一个小样本在这里：

structure(list(Feret = c(0.017, 0.016, 2.12, 0.016, 0.02, 0.023, 0.017, 0.021, 0.02, 0.016, 0.027, 0.052, 0.061, 0.033, 0.041, 0.017, 6.561, 7.123, 0.027, 0.018, 0.024, 4.099, 0.022, 0.025, 0.037, 0.037, 0.018, 0.039, 0.027, 0.053, 0.016, 0.107, 0.52, 0.041, 0.038, 0.039, 0.03, 0.071, 0.022, 0.118, 0.032, 0.018, 0.027, 0.035, 8.113, 0.078, 4.089, 0.035, 0.057, 6.905, 2.5, 0.282, 0.045, 0.039, 0.071, 0.037, 0.029, 0.027, 0.016, 0.02, 0.026, 0.025, 0.026, 0.016, 0.016, 0.021), sample.type = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("flower", "leaf"), class = "factor"), leaf.side = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("lower", "upper"), class = "factor"), canopy = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("bottom", "top"), class = "factor"), treatment = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("blue", "green", "grey", "white", "yel-green" ), class = "factor")), .Names = c("Feret", "sample.type", "leaf.side", "canopy", "treatment"), row.names = c(500000L, 500001L, 500002L, 500003L, 500004L, 
500005L, 500006L, 500007L, 500008L, 500009L, 500010L, 800000L, 800001L, 800002L, 800003L, 800004L, 800005L, 800006L, 800007L, 800008L, 800009L, 800010L, 1000L, 1001L, 1002L, 1003L, 1004L, 1005L, 1006L, 1007L, 1008L, 1009L, 1010L, 10000L, 10001L, 10002L, 10003L, 10004L, 10005L, 10006L, 10007L, 10008L, 10009L, 10010L, 100000L, 100001L, 100002L, 100003L, 100004L, 100005L, 100006L, 100007L, 100008L, 100009L, 100010L, 1160000L, 1160001L, 1160002L, 1160003L, 1160004L, 1160005L, 1160006L, 1160007L, 1160008L, 1160009L, 1160010L), class = "data.frame")

我一直在尝试使用以下分箱宽度（bin widths）创建'Feret'变量的频率计数：

# Bin edges (variable widths) used to count the Feret values
bins <- c(0.01, 0.03, 0.1, 0.3, 1, 3, 10)

然后使用：

# Bin the Feret values; the per-bin counts are read from freq$counts below
freq <- hist(df_temp$Feret, breaks = bins)
# Build one "lower - upper" label per bin from consecutive break points
ranges <- paste(head(bins, -1), bins[-1], sep = " - ")
freq$counts
# Pair each bin label with its count in a single table
df5 <- data.frame(ranges = ranges, frequency = freq$counts)
df5

但我真正需要做的是按各个因子（“sample.type”、“leaf.side”、“canopy”、“treatment”）拆分data.frame，并提取每个子集的频率计数。我可以用手动创建每个子集这种冗长的方式来实现，但我希望有更好的办法。我尝试过用循环创建子集，然后对每个子集应用hist()函数，但这花费了很长时间。使用dplyr或apply有更好的方法吗？我更希望只把结果放在一个表格中，然后根据需要绘图。

I have quite a large dataset (over 1 million rows) of which a small sample is here:

structure(list(Feret = c(0.017, 0.016, 2.12, 0.016, 0.02, 0.023, 0.017, 0.021, 0.02, 0.016, 0.027, 0.052, 0.061, 0.033, 0.041, 0.017, 6.561, 7.123, 0.027, 0.018, 0.024, 4.099, 0.022, 0.025, 0.037, 0.037, 0.018, 0.039, 0.027, 0.053, 0.016, 0.107, 0.52, 0.041, 0.038, 0.039, 0.03, 0.071, 0.022, 0.118, 0.032, 0.018, 0.027, 0.035, 8.113, 0.078, 4.089, 0.035, 0.057, 6.905, 2.5, 0.282, 0.045, 0.039, 0.071, 0.037, 0.029, 0.027, 0.016, 0.02, 0.026, 0.025, 0.026, 0.016, 0.016, 0.021), sample.type = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("flower", "leaf"), class = "factor"), leaf.side = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("lower", "upper"), class = "factor"), canopy = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("bottom", "top"), class = "factor"), treatment = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("blue", "green", "grey", "white", "yel-green" ), class = "factor")), .Names = c("Feret", "sample.type", "leaf.side", "canopy", "treatment"), row.names = c(500000L, 500001L, 500002L, 500003L, 500004L, 
500005L, 500006L, 500007L, 500008L, 500009L, 500010L, 800000L, 800001L, 800002L, 800003L, 800004L, 800005L, 800006L, 800007L, 800008L, 800009L, 800010L, 1000L, 1001L, 1002L, 1003L, 1004L, 1005L, 1006L, 1007L, 1008L, 1009L, 1010L, 10000L, 10001L, 10002L, 10003L, 10004L, 10005L, 10006L, 10007L, 10008L, 10009L, 10010L, 100000L, 100001L, 100002L, 100003L, 100004L, 100005L, 100006L, 100007L, 100008L, 100009L, 100010L, 1160000L, 1160001L, 1160002L, 1160003L, 1160004L, 1160005L, 1160006L, 1160007L, 1160008L, 1160009L, 1160010L), class = "data.frame")

I have been trying to create frequency counts of the 'Feret' variable with the following bin widths:

# Bin edges (variable widths) used to count the Feret values
bins <- c(0.01, 0.03, 0.1, 0.3, 1, 3, 10)

and then using:

# Bin the Feret values; the per-bin counts are read from freq$counts below
freq <- hist(df_temp$Feret, breaks = bins)
# Build one "lower - upper" label per bin from consecutive break points
ranges <- paste(head(bins, -1), bins[-1], sep = " - ")
freq$counts
# Pair each bin label with its count in a single table
df5 <- data.frame(ranges = ranges, frequency = freq$counts)
df5

But what I really need to do is split the data.frame up by the various factors ("sample.type", "leaf.side", "canopy", "treatment") and extract frequency counts for each subset. I can do this the long-winded way by manually creating each subset, but I would like to do it a better way. I've tried using loops to create the subsets and then applying the hist() function to each subset, but it was taking a very long time. Is there a better way using dplyr or apply? I'd prefer just to have the results in a table, and then I can plot them as required.

最满意答案

以下片段应该做你想做的事情：

我将你的示例加载到df 。

library("dplyr")

# For every combination of the four factors, cut Feret into the bins
# and tabulate how many values fall into each bin.
df %>%
  group_by(sample.type, leaf.side, canopy, treatment) %>%
  dplyr::select(Feret) %>%
  do(data.frame(table(cut(.$Feret, breaks = bins, include.lowest = TRUE))))

请参阅dplyr文档。简而言之， x %>% f 就是 f(x) ， x %>% f(a) 就是 f(x,a) 。

请注意， dplyr::select 其实就是 select ，但我多次遇到命名空间问题，所以现在总是指明包名。

table(cut(df$Feret, breaks=bins))只是一个更好的方式来做你用hist做的事情。使用cut ，您可以创建一个因子变量（记住如果您的值可以达到下限，请添加include.lowest = T）并使用table计算每个级别的频率。

这给出：

   sample.type leaf.side canopy treatment        Var1 Freq
1       flower     upper    top     green (0.01,0.03]    0
2       flower     upper    top     green  (0.03,0.1]    6
3       flower     upper    top     green   (0.1,0.3]    1
4       flower     upper    top     green     (0.3,1]    0
5       flower     upper    top     green       (1,3]    1
6       flower     upper    top     green      (3,10]    3
7       flower     upper    top     white (0.01,0.03]    4
8       flower     upper    top     white  (0.03,0.1]    4
9       flower     upper    top     white   (0.1,0.3]    0
10      flower     upper    top     white     (0.3,1]    0
11      flower     upper    top     white       (1,3]    0
12      flower     upper    top     white      (3,10]    3
13        leaf     lower bottom     white (0.01,0.03]    5
14        leaf     lower bottom     white  (0.03,0.1]    4
15        leaf     lower bottom     white   (0.1,0.3]    1
16        leaf     lower bottom     white     (0.3,1]    1
17        leaf     lower bottom     white       (1,3]    0
18        leaf     lower bottom     white      (3,10]    0
19        leaf     lower    top      grey (0.01,0.03]   10
20        leaf     lower    top      grey  (0.03,0.1]    1
21        leaf     lower    top      grey   (0.1,0.3]    0
22        leaf     lower    top      grey     (0.3,1]    0
23        leaf     lower    top      grey       (1,3]    0
24        leaf     lower    top      grey      (3,10]    0
25        leaf     upper bottom     white (0.01,0.03]    4
26        leaf     upper bottom     white  (0.03,0.1]    6
27        leaf     upper bottom     white   (0.1,0.3]    1
28        leaf     upper bottom     white     (0.3,1]    0
29        leaf     upper bottom     white       (1,3]    0
30        leaf     upper bottom     white      (3,10]    0
31        leaf     upper    top      blue (0.01,0.03]   10
32        leaf     upper    top      blue  (0.03,0.1]    0
33        leaf     upper    top      blue   (0.1,0.3]    0
34        leaf     upper    top      blue     (0.3,1]    0
35        leaf     upper    top      blue       (1,3]    1
36        leaf     upper    top      blue      (3,10]    0

（实际上，它不会像这样打印，因为这是一个tbl，但你可以使用print.data.frame以旧的方式打印tbl。）

从这里可以直接提取你想要的信息。

The following snippet should do what you want:

I loaded your sample into df.

library("dplyr")

# For every combination of the four factors, cut Feret into the bins
# and tabulate how many values fall into each bin.
df %>%
  group_by(sample.type, leaf.side, canopy, treatment) %>%
  dplyr::select(Feret) %>%
  do(data.frame(table(cut(.$Feret, breaks = bins, include.lowest = TRUE))))

I refer you to the dplyr documentation. In short, x %>% f is f(x) and x %>% f(a) is f(x,a).

Note that dplyr::select is just select, but I have had namespace issues so many times that I now always specify the package.

table(cut(df$Feret, breaks=bins)) is just a nicer way to do what you did with hist. With cut, you create a factor variable (Remember to add include.lowest=T if your values can reach the lower bound) and with table, you count the frequency of each level.

This gives:

   sample.type leaf.side canopy treatment        Var1 Freq
1       flower     upper    top     green (0.01,0.03]    0
2       flower     upper    top     green  (0.03,0.1]    6
3       flower     upper    top     green   (0.1,0.3]    1
4       flower     upper    top     green     (0.3,1]    0
5       flower     upper    top     green       (1,3]    1
6       flower     upper    top     green      (3,10]    3
7       flower     upper    top     white (0.01,0.03]    4
8       flower     upper    top     white  (0.03,0.1]    4
9       flower     upper    top     white   (0.1,0.3]    0
10      flower     upper    top     white     (0.3,1]    0
11      flower     upper    top     white       (1,3]    0
12      flower     upper    top     white      (3,10]    3
13        leaf     lower bottom     white (0.01,0.03]    5
14        leaf     lower bottom     white  (0.03,0.1]    4
15        leaf     lower bottom     white   (0.1,0.3]    1
16        leaf     lower bottom     white     (0.3,1]    1
17        leaf     lower bottom     white       (1,3]    0
18        leaf     lower bottom     white      (3,10]    0
19        leaf     lower    top      grey (0.01,0.03]   10
20        leaf     lower    top      grey  (0.03,0.1]    1
21        leaf     lower    top      grey   (0.1,0.3]    0
22        leaf     lower    top      grey     (0.3,1]    0
23        leaf     lower    top      grey       (1,3]    0
24        leaf     lower    top      grey      (3,10]    0
25        leaf     upper bottom     white (0.01,0.03]    4
26        leaf     upper bottom     white  (0.03,0.1]    6
27        leaf     upper bottom     white   (0.1,0.3]    1
28        leaf     upper bottom     white     (0.3,1]    0
29        leaf     upper bottom     white       (1,3]    0
30        leaf     upper bottom     white      (3,10]    0
31        leaf     upper    top      blue (0.01,0.03]   10
32        leaf     upper    top      blue  (0.03,0.1]    0
33        leaf     upper    top      blue   (0.1,0.3]    0
34        leaf     upper    top      blue     (0.3,1]    0
35        leaf     upper    top      blue       (1,3]    1
36        leaf     upper    top      blue      (3,10]    0

(Actually, it doesn't print like this since this is a tbl, but you can use print.data.frame to print a tbl the old way.)

From here it should be straightforward to extract the info you want.

更多推荐