我想做的是将df1字符串与df2字符串匹配,并获得类似的字符串
什么我做的就是这样
#一个函数,用于安排数据为每个字符串设置ID normalize< - 函数(x,delim){x < - gsub(),,x,fixed = TRUE)x < - gsub((,,x,fixed = ) idx< - rep(seq_len(length(x)),times = nchar(gsub(sprintf([^%s],delim),,as.character(x)))+ 1)名称< - unlist(strsplit(as.character(x),delim)) return(setNames(idx,names))} #一个函数来排列第二个df lookup< - normalize(df2 [,1],,) #一个函数来匹配它们,并给出ID 过程< - function(s){ lookup_try< - lookup [names(s)] found< - which(!is.na(lookup_try)) pos < lookup_try [names(s)[found]] return(paste(s [found],pos,sep = - ))#将最后一行替换为return (as.character(pos))只得到评论中的结果}然后我得到这样的结果
res< - lapply(colnames(df1),function(x)process标准化(df1 [,x],;)))来自df1的每个字符串和匹配的df2的字符串行数。所以这个数据的输出看起来像这样
> res $ s1 [1]3-44-15-4 $ s2 [1]2-4 3-157-16第一列 ID 是与df1中的字符串匹配的df2的行号df1 第二列否是匹配的次数第三列 ID-col-n 是df1中与字符串匹配的字符串的行号+它们的列名称第四个是与该字符串匹配的df1的第一列的字符串第五列是字符串的第二列匹配该字符串等等
解决方案在这种情况下,我发现更容易切换数据到宽格式,然后将其合并到查找表。
您可以尝试:
library(tidyr)库(dplyr) df1_tmp< - df1 df2_tmp< - df2 #add数字id to df1_tmp保留行信息 df1_tmp $ id < - seq_along(df1_tmp [,1])$ b $ b#切换到具有多个字符串的宽和不排序的行 df1_tmp< - gather(df1_tmp,key =s_val,value =query_string, - id) df1_tmp< df1_tmp%>% mutate(query_string = strsplit(as.character(query_string),;))%>% unnest(query_string) df2_tmp $ IDs。 < - gsub([()],,df2_tmp $ IDs。) #add数字ID到df1_tmp以保留行信息 df2_tmp $ id < - seq_along df2_tmp $ ID。) 带有多个字符串的#unnest行 df2_tmp< - df2_tmp%>% mutate(IDs。= strsplit(as.character(IDs。 ,))%>% unnest(IDs。) res< - merge(df1_tmp,df2_tmp,by.x =query_string,by.y = ID $ res $ ID_col_n< - paste(paste0(res $ id.x,res $ s_val)) res $ total_id< - 1:nrow(res) res< - spread(res,s_val,value = query_string,fill = NA) res #summarize获取必需的输出 res< - res% >%group_by(id.y)%>% mutate(No = n())%>%group_by(id.y,No)%>% summarise_each(funs (。[!is.na(。)],collapse =,)))%>% select(-id.x,-total_id) colnames(res) [colnames(res)==id.y]< - IDs res $ df1_colMatch_counts< - rowSums(res [, - (1:3)]!=) df2_counts< - df2_tmp%>%gr oup_by(id)%>%summaryize(df2_string_counts = n()) res< - merge(res,df2_counts,by.x =IDs,by.y =id) res res ID否ID_col_n s1 s2 df1_colMatch_counts df2_string_counts 1 1 1 4s1 P41182 1 2 2 2 1 4s1 P41182 1 2 3 3 1 4s1 P41182 1 2 4 4 3 2s2,3s1,5s1 Q9Y6Q9,Q09472 Q92831 2 4 5 15 1 3s2 P54612 1 5 6 16 1 7s2 O15143 1 7
I have been busy with this question since last night and I could not figure out how to do it.
What I want to do is to match df1 strings to df2 strings and get the similar ones out
what I do is like this
# a function to arrange the data to have IDs for each string normalize <- function(x, delim) { x <- gsub(")", "", x, fixed=TRUE) x <- gsub("(", "", x, fixed=TRUE) idx <- rep(seq_len(length(x)), times=nchar(gsub(sprintf("[^%s]",delim), "", as.character(x)))+1) names <- unlist(strsplit(as.character(x), delim)) return(setNames(idx, names)) } # a function to arrange the second df lookup <- normalize(df2[,1], ",") # a function to match them and give the IDs process <- function(s) { lookup_try <- lookup[names(s)] found <- which(!is.na(lookup_try)) pos <- lookup_try[names(s)[found]] return(paste(s[found], pos, sep="-")) #change the last line to "return(as.character(pos))" to get only the result as in the comment }then I get the results like this
res <- lapply(colnames(df1), function(x) process(normalize(df1[,x], ";")))This gives me the row number of each string from df1 and row number of string from df2 that matched. so the output of this data looks like this
> res $s1 [1] "3-4" "4-1" "5-4" $s2 [1] "2-4" "3-15" "7-16"The first column IDs is the row number of df2 which matched with strings in df1 The second column No is the number of times it matched The third column ID-col-n is the row number of string in df1 which matched with that string + their column name the forth is string from first column of the df1 which matched with that string the fifth column is the string of second column which matched with that string and so on
解决方案In this case I find it easier to switch the data to the wide format and before merging it to the lookup table.
You could try:
library(tidyr) library(dplyr) df1_tmp <- df1 df2_tmp <- df2 #add numerical id to df1_tmp to keep row information df1_tmp$id <- seq_along(df1_tmp[,1]) #switch to wide and unnest rows with several strings df1_tmp <- gather(df1_tmp,key="s_val",value="query_string",-id) df1_tmp <- df1_tmp %>% mutate(query_string = strsplit(as.character(query_string), ";")) %>% unnest(query_string) df2_tmp$IDs. <- gsub("[()]", "", df2_tmp$IDs.) #add numerical id to df1_tmp to keep row information df2_tmp$id <- seq_along(df2_tmp$IDs.) #unnest rows with several strings df2_tmp <- df2_tmp %>% mutate(IDs. = strsplit(as.character(IDs.), ",")) %>% unnest(IDs.) res <- merge(df1_tmp,df2_tmp,by.x="query_string",by.y="IDs.") res$ID_col_n <- paste(paste0(res$id.x,res$s_val)) res$total_id <- 1:nrow(res) res <- spread(res,s_val,value=query_string,fill=NA) res #summarize to get required output res <- res %>% group_by(id.y) %>% mutate(No=n()) %>% group_by(id.y,No) %>% summarise_each(funs(paste(.[!is.na(.)],collapse=","))) %>% select(-id.x,-total_id) colnames(res)[colnames(res)=="id.y"]<-"IDs" res$df1_colMatch_counts <- rowSums(res[,-(1:3)]!="") df2_counts <- df2_tmp %>% group_by(id) %>% summarize(df2_string_counts=n()) res <- merge(res,df2_counts,by.x="IDs",by.y="id") res res IDs No ID_col_n s1 s2 df1_colMatch_counts df2_string_counts 1 1 1 4s1 P41182 1 2 2 2 1 4s1 P41182 1 2 3 3 1 4s1 P41182 1 2 4 4 3 2s2,3s1,5s1 Q9Y6Q9,Q09472 Q92831 2 4 5 15 1 3s2 P54612 1 5 6 16 1 7s2 O15143 1 7
更多推荐
如何重新排列两个数据帧之间的匹配顺序
发布评论