others - shell 使用 sed/awk 从文本文件中提取行, 其中该行与前一行有 N 个相同的单词

  显示原文与译文双语对照的内容
51 4

这是一个示例文本文件:

word1 word2 word3 word4
word4 word5 word6 word7
word6 word7 word8 word9
word9 word6 word8 word3
word1 word4 word5 word4

用什么命令可以提取与前一行有 N 个相同单词的行?

在示例文件中,提取与上一行有 3 个相同单词的行,将输出:

word9 word6 word8 word3

注意:使用编程语言( 提取 array_sentence1.uniq

时间:原作者:0个回答

131 3

以下是AWK中的解决方案:

▶ printf '%s\n' \
    'word1 word2 word3 word4' \
    'word4 word5 word6 word7' \
    'word6 word7 word8 word9' \
    'word9 word6 word8 word3' \
    'word1 word4 word5 word4' > FILE

这是我最初的解决方案。它假定每一行中的单词都是唯一的(不重复)。

# script.awk
# Print every record that has 3 or more words in common with the record
# immediately before it.  Words are compared pairwise, so this assumes the
# words within a single record are unique (a repeated word is counted once
# per matching pair).
NR > 1 {                                # On lines other than the first:
    split(last, last_ar)                # Split the last record and the
    split($0, curr_ar)                  # current record.
    found = 0                           # Count how many words curr_ar
    for (i in curr_ar)                  # and last_ar have in common.
        for (j in last_ar)
            if (last_ar[j] == curr_ar[i])
                found++
    if (found >= 3) print               # . . . and print this record
                                        # if 3 or more were found.
}
{
    last = $0                           # On all lines.
}

为了处理单词重复的情况,我修改了这个解决方案;它用到了 GNU awk 的功能(delete 整个数组、对数组使用 length),Mac OS X 上的 nawk 也支持这些功能:

# script.gawk
# Print every record that has 3 or more distinct words in common with the
# record immediately before it.  Relies on "delete array" and length(array).
NR > 1 {
    split(last, last_ar)
    split($0, curr_ar)
    delete found                        # Count how many unique occurrences
    for (i in curr_ar)                  # of words are seen.
        for (j in last_ar)
            if (last_ar[j] == curr_ar[i])
                found[curr_ar[i]]++     # keyed by word, so repeats collapse
    if (length(found) >= 3) print       # number of distinct shared words
}
{
    last = $0                           # remember this record for the next one
}

测试:

▶ gawk -f script.gawk FILE
word9 word6 word8 word3
原作者:
132 5
$ cat tst.awk
# Print every record that has 3 or more distinct words in common with the
# record immediately before it.
#   seen[] - words already encountered in the current record
#   prev[] - words of the previous record
{
    delete seen
    cnt = 0
    for (i = 1; i <= NF; i++) {
        word = $i
        # Count a word only on its first occurrence in this record, and only
        # if the previous record contained it.  "in" tests membership without
        # creating an empty prev[word] entry.
        if (!seen[word]++ && (word in prev)) {
            cnt++
        }
    }
    if (cnt >= 3) {
        print
    }
    # The words of this record become the reference set for the next one.
    delete prev
    for (word in seen) {
        prev[word]++
    }
}
$ awk -f tst.awk file
word9 word6 word8 word3
原作者:
92 0

你可以通过使用哈希来确保唯一值,这里是一个示例脚本:

parse.awk

# Print each line that has at least N distinct words in common with the line
# before it.  N is read as an awk variable, e.g.: awk -f parse.awk N=3 infile
#   h - set of words on the previous line
#   g - set of common words already counted on the current line

# Only start checking from the second line
NR > 1 {
    c = 0                       # Variable to hold the common word count
    # Run through unique words and compare to previous line
    for (i = 1; i <= NF; i++) {
        if (($i in h) && !($i in g)) {
            c++
            g[$i]               # mark as counted so a repeat is ignored
        }
    }
    # Reset the associative arrays
    delete h
    delete g
}
# If we had enough matches print the current line
c >= N
# Collect current line into the h associative array
{
    for (i = 1; i <= NF; i++)
        h[$i]
}

像这样运行:

awk -f parse.awk N=3 infile

输出:

word9 word6 word8 word3
原作者:
70 0

一种方法:

$ awk '
# For every record, count the fields that are keys of a (the words of the
# previous record), then rebuild a from the current record.  The record is
# printed when the count is exactly 3.
{
    hits = 0
    for (k = 1; k <= NF; k++) {
        if ($k in a) {
            hits++
        }
    }
    split("", a)                # empty the array
    for (k = 1; k <= NF; k++) {
        a[$k]                   # a bare reference creates the key
    }
}
hits == 3' file
word9 word6 word8 word3

将上一行的单词存储在关联数组 a 中。处理当前行时,逐个检查其单词是否在数组 a 中,在则将计数器 x 加一。

原作者:
140 4

替代解决方案:

awk '
# l holds the words of the previous record (each mapped to 1).
{
    # Count the fields of the current record that were in the previous one.
    c = 0
    for (k = 1; k <= NF; k++)
        if ($k in l)
            c += 1

    # Replace the remembered words with those of the current record.
    delete l
    for (k = 1; k <= NF; k++)
        l[$k] = 1
}
# Print the record when 3 or more of its fields matched.
c >= 3' <your file>
原作者:
61 4
$ echo '
> word1 word2 word3 word4
> word4 word5 word6 word7
> word6 word7 word8 word9
> word9 word6 word8 word3
> word1 word4 word5 word4
> ' | awk -v n=3 '
> # Print each record that shares at least n words with the previous record.
> # word[] holds how many times each word occurred in the previous record;
> # every match uses up one occurrence, so a repeated word only counts as
> # often as it appeared in the previous record.
> NR == 1 { for (i = 1; i <= NF; i++) { word[$i]++ } }
> NR > 1 { counter = 0
>     for (i = 1; i <= NF; i++) {
>         if (word[$i]-- > 0) counter++ }
>     if (counter >= n) print $0
>     delete word
>     for (i = 1; i <= NF; i++) { word[$i]++ } }
> '
word9 word6 word8 word3
原作者:
130 3

如果你的数据在文件 d 中,可以试试 GNU awk:

awk '
# Print every record of file d that shares at least 3 distinct words with the
# record immediately before it.
#   prev - set of words on the previous record (rebuilt for every record, so
#          words from older records are never matched)
#   seen - shared words already counted on the current record
{
    c = 0
    split("", seen)
    for (i = 1; i <= NF; i++)
        if (($i in prev) && !($i in seen)) {
            seen[$i]
            c++
        }
    if (c >= 3) print
    split("", prev)
    for (i = 1; i <= NF; i++)
        prev[$i]
}' d
原作者:
...