others - shell 使用 sed/awk 从文本文件中提取行,要求该行与前一行有 N 个共同的单词

51 4

这是一个示例文本文件:


word1 word2 word3 word4


word4 word5 word6 word7


word6 word7 word8 word9


word9 word6 word8 word3


word1 word4 word5 word4



用什么命令可以提取出与前一行有 N 个共同单词的行?

在示例文件中,提取与上一行有 3 个共同单词的行,将输出:


word9 word6 word8 word3



注意:使用编程语言( 提取 array_sentence1.uniq

时间: 原作者:

131 3

以下是AWK中的解决方案:


▶ cat > FILE <<EOF
word1 word2 word3 word4
word4 word5 word6 word7
word6 word7 word8 word9
word9 word6 word8 word3
word1 word4 word5 word4
EOF



我原来的解决方案在这里。 它假定每行中的字是唯一的。


# script.awk
#
# Print every line that shares at least 3 words with the line before it.
# Assumes the words within a single line are unique: every matching pair
# of words is counted, so a repeated word is counted more than once.

NR > 1 {                          # On lines other than the first:
    split(last, last_ar)          # Split the previous record and the
    split($0, curr_ar)            # current record into words.

    found = 0                     # Count how many words curr_ar
    for (i in curr_ar)            # and last_ar have in common.
        for (j in last_ar)
            if (last_ar[j] == curr_ar[i])
                found++

    if (found >= 3) print         # Print this record if 3 or more
                                  # were found.
}

{
    last = $0                     # On all lines: remember the record.
}



为了处理同一行中单词重复的情况,我修改了这个解决方案。它用到了 GNU awk 的一个特性(对数组调用 length()),Mac OS X 上的 nawk 也支持该特性:


# script.gawk
#
# Print every line that shares at least 3 distinct words with the line
# before it. Relies on length() accepting an array argument.

NR > 1 {
    split(last, last_ar)          # Words of the previous record.
    split($0, curr_ar)            # Words of the current record.

    delete found                  # Collect each common word once,
    for (i in curr_ar)            # keyed by the word itself.
        for (j in last_ar)
            if (last_ar[j] == curr_ar[i])
                found[curr_ar[i]]++

    # length(found) is the number of distinct common words.
    if (length(found) >= 3) print
}

{
    last = $0                     # Remember this record for the next one.
}



测试:


▶ gawk -f script.gawk FILE


word9 word6 word8 word3



原作者:
132 5

$ cat tst.awk


# Print every line that has at least 3 distinct words in common with the
# previous line.
{
    delete seen
    cnt = 0
    for (i = 1; i <= NF; i++) {
        word = $i
        # Count a word only on its first occurrence in this line, and
        # only if the previous line contained it.
        cnt += (!seen[word]++ && prev[word] ? 1 : 0)
    }

    if (cnt >= 3) {
        print
    }

    # The words of this line become the reference set for the next line.
    delete prev
    for (word in seen) {
        prev[word]++
    }
}



$ awk -f tst.awk file


word9 word6 word8 word3



原作者:
92 0

你可以通过使用哈希来确保唯一值,这里是一个示例脚本:

parse.awk


# Only start checking from the second line
NR > 1 {
    c = 0    # Number of distinct words shared with the previous line

    # Walk the fields: h holds the words of the previous line, g the
    # words of this line that have already been counted.
    for (i = 1; i <= NF; i++) {
        if (($i in h) && !($i in g)) {
            c++
            g[$i]
        }
    }

    # Reset the associative arrays
    delete h
    delete g
}

# If we had enough matches print the current line
c >= N

# Collect current line into the h associative array
{
    for (i = 1; i <= NF; i++)
        h[$i]
}



像这样运行:


awk -f parse.awk N=3 infile



输出:


word9 word6 word8 word3



原作者:
70 0

一种方法:


$ awk '{x=0;split("",s);for(i=1;i<=NF;i++)if(($i in a)&&!($i in s)){s[$i];x++};split("",a);for(i=1;i<=NF;i++)a[$i]}x==3' file


word9 word6 word8 word3



将行内容存储在关联数组中。 然后检查关联数组并增加计数器x 。

原作者:
140 4

替代解决方案:


awk '{
    c = 0
    for (i = 1; i <= NF; i++) {
        # l holds the words of the previous line. Removing a word once it
        # has matched makes a repeated word count only once.
        if ($i in l) { c++; delete l[$i] }
    }

    # Rebuild l from the current line for the next record.
    delete l
    for (i = 1; i <= NF; i++) {
        l[$i] = 1
    }
}
c >= 3' <your file>



61 4

$ echo '
> word1 word2 word3 word4
> word4 word5 word6 word7
> word6 word7 word8 word9
> word9 word6 word8 word3
> word1 word4 word5 word4
> ' | awk -v n=3 '
> # word[] maps each word of the previous line to its number of occurrences.
> NR == 1 { for (i = 1; i <= NF; i++) { word[$i]++ } }
> NR > 1 { counter = 0
>     for (i = 1; i <= NF; i++) {
>         # Use up one occurrence per match, so a word is counted at most
>         # as often as it appeared in the previous line.
>         if (word[$i]-- > 0) counter++ }
>     if (counter >= n) print $0
>     delete word
>     for (i = 1; i <= NF; i++) { word[$i]++ } }
> '


word9 word6 word8 word3



原作者:
130 3

如果你的数据在文件 d 中,可以试试 GNU awk:


awk 'NR>1{c=0;for(i=1;i<=NF;i++)if($i in a){c++;delete a[$i]};if(c>=3)print} {split("",a);for(i=1;i<=NF;i++)a[$i]}' d



...