本帖最后由 yinyuemi 于 2011-09-02 09:24 編輯
- for((i=1;i<=1000;i++)); do echo '02,10,11,18,27,30
- 06,09,11,14,20,31
- 02,10,11,18,27,33
- 01,03,04,16,18,22
- 07,14,22,27,28,32
- 02,10,11,16,18,27
- 04,05,09,15,21,30
- 12,13,17,20,22,23
- 04,05,09,15,21,30' >>testfile; done
- wc -l testfile
- 9000 testfile
- 4#:
- time awk -F, '{s=0;m=1;for(i=1;i<=NF;i++)s+=$i^3;for(i=1;i<=NF;i++)m*=(!a[s-$i^3]++);if(m)print}' testfile
- 02,10,11,18,27,30
- 06,09,11,14,20,31
- 01,03,04,16,18,22
- 07,14,22,27,28,32
- 04,05,09,15,21,30
- 12,13,17,20,22,23
- real 0m0.154s
- user 0m0.154s
- sys 0m0.000s
- 10#:(ctrl-c 終止程序)
- time awk -F, '{T="";for(n=0;n++<NF;){T=T","a[$n];a[$n]=a[$n]","NR};split(T,t,",");f=0;for(n in t){if(t[n]=="")continue;if(gsub(t[n],"",T)>=5)f=1};if(f==0)print $0}' testfile
- 02,10,11,18,27,30
- 06,09,11,14,20,31
- 01,03,04,16,18,22
- 07,14,22,27,28,32
- 04,05,09,15,21,30
- 12,13,17,20,22,23
- real 10m20.540s
- user 10m18.173s
- sys 0m0.982s
- 12#:
- time awk -F, '{for(i=1;i<=l;i++){s=0;for(j=1;j<=NF;j++)if(index(a[i],$j))s++;if(s>=5)next}a[++l]=$0}1' testfile
- 02,10,11,18,27,30
- 06,09,11,14,20,31
- 01,03,04,16,18,22
- 07,14,22,27,28,32
- 04,05,09,15,21,30
- 12,13,17,20,22,23
- real 0m0.128s
- user 0m0.127s
- sys 0m0.001s
- 21#:
- time awk '
- function f(a,b){return "#"gensub(",","#"b"#","g",a)"#";}
- {e=0
- {for(i=1;i<NR;i++)
- if(split(f(a[i],""),x,f($0,"|"))>=6){e=1;break}
- }
- if(!e){a[NR]=$0;print $0}
- }' testfile
- 02,10,11,18,27,30
- 06,09,11,14,20,31
- 01,03,04,16,18,22
- 07,14,22,27,28,32
- 04,05,09,15,21,30
- 12,13,17,20,22,23
- real 0m0.841s
- user 0m0.837s
- sys 0m0.001s
- 4#waker兄的效率很高,不過適用于數值型數據
- 10# 通用性好,不過效率上要差
- 12# 如Tim兄所言,數值是2位的,效率很高
- 21# 通用性上稍差,如果是文本去重復的,且文本中包含正則符號或","或"#",可能會有問題(10#的代碼使用gsub可能也有類似問題,沒測試),效率上比4#和12#的要差
復制代碼 |