te')); return $arr; }

/**
 * List the threads posted by one user.
 *
 * @param mixed  $uid      User ID; an empty value yields an empty array.
 * @param int    $page     Page number.
 * @param int    $pagesize Number of records per page.
 * @param bool   $desc     Sort order on tid: TRUE descending, FALSE ascending.
 * @param string $key      Column whose value is used as the key of the returned array.
 * @param array  $col      Columns to query.
 * @return mixed Whatever thread_tid__find() returns, or array() when $uid is empty.
 */
function thread_tid_find_by_uid($uid, $page = 1, $pagesize = 1000, $desc = TRUE, $key = 'tid', $col = array()) {
	if (empty($uid)) {
		return array();
	}
	// -1 requests descending tid order, 1 ascending.
	$direction = $desc ? -1 : 1;
	return thread_tid__find(array('uid' => $uid), array('tid' => $direction), $page, $pagesize, $key, $col);
}

/**
 * List the thread ids under a forum. Per the original note, $fid may also be
 * an array, e.g. array(1, 2, 3).
 *
 * Only the 'tid' and 'verify_date' columns are requested, and the result is
 * keyed by 'tid'.
 *
 * @param mixed $fid      Forum ID (or array of IDs); an empty value yields an empty array.
 * @param int   $page     Page number.
 * @param int   $pagesize Number of records per page.
 * @param bool  $desc     Sort order on tid: TRUE descending, FALSE ascending.
 * @return mixed Whatever thread_tid__find() returns, or array() when $fid is empty.
 */
function thread_tid_find_by_fid($fid, $page = 1, $pagesize = 1000, $desc = TRUE) {
	if (empty($fid)) {
		return array();
	}
	// -1 requests descending tid order, 1 ascending.
	$direction = $desc ? -1 : 1;
	return thread_tid__find(array('fid' => $fid), array('tid' => $direction), $page, $pagesize, 'tid', array('tid', 'verify_date'));
}

/**
 * Delete the record for one thread id.
 *
 * @param mixed $tid Thread ID; an empty value returns FALSE without deleting.
 * @return mixed FALSE for an empty $tid, otherwise the result of thread_tid__delete().
 */
function thread_tid_delete($tid) {
	if (empty($tid)) {
		return FALSE;
	}
	return thread_tid__delete(array('tid' => $tid));
}

/**
 * Count all records, with no condition.
 *
 * @return mixed The result of thread_tid__count().
 */
function thread_tid_count() {
	return thread_tid__count();
}

/**
 * Count the threads of one user.
 *
 * Original author's note: use non-primary-key counting with caution on large
 * data sets.
 *
 * @param mixed $uid User ID.
 * @return mixed The result of thread_tid__count().
 */
function thread_uid_count($uid) {
	return thread_tid__count(array('uid' => $uid));
}

/**
 * Count the threads of one forum.
 *
 * Original author's note: use non-primary-key counting with caution on large
 * data sets.
 *
 * @param mixed $fid Forum ID.
 * @return mixed The result of thread_tid__count().
 */
function thread_fid_count($fid) {
	return thread_tid__count(array('fid' => $fid));
}

?>bash - How to match patterns from one file against a specific column in another file using grep? - Stack Overflow
最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

bash - How to match patterns from one file against a specific column in another file using grep? - Stack Overflow

programmeradmin3浏览0评论

I have a file, file1, which contains a single string per line. I want to check whether each string exists in the second column of file2. file2 contains two strings per line, separated by a single space, but some lines may have leading spaces before the first column.

I want to use only grep and/or cut to perform the match and output matching lines from file2 to newFile.txt, ensuring whole word matching (-w).

I've tried

grep -wF -f file1 file2 > newFile.txt 

but due to the file size terminal runs infinitely.

I've also tried

grep -wF -f <(cut -d ' ' -f 2 file2) | grep -wF -f - file2 > newFile.txt 

This only works for some lines in file2 because some lines have multiple spaces before the 2 strings, although those strings are only separated by single space.

File1:

 aaa
 bbb
 ccc

File2:

 a aaa (should match) 
     b bbb (should match and does but the former spaces throws off cut -d, resulting in incomplete output of line in the newFile.txt)
 c cc (should not match) 

Question: How can I efficiently extract and match whole words in the second column of file2.txt, while handling inconsistent leading spaces? I prefer using grep and/or cut, but I'm open to small modifications.

I have a file, file1, which contains a single string per line. I want to check whether each string exists in the second column of file2. file2 contains two strings per line, separated by a single space, but some lines may have leading spaces before the first column.

I want to use only grep and/or cut to perform the match and output matching lines from file2 to newFile.txt, ensuring whole word matching (-w).

I've tried

grep -wF -f file1 file2 > newFile.txt 

but due to the file size terminal runs infinitely.

I've also tried

grep -wF -f <(cut -d ' ' -f 2 file2) | grep -wF -f - file2 > newFile.txt 

This only works for some lines in file2 because some lines have multiple spaces before the 2 strings, although those strings are only separated by single space.

File1:

 aaa
 bbb
 ccc

File2:

 a aaa (should match) 
     b bbb (should match and does but the former spaces throws off cut -d, resulting in incomplete output of line in the newFile.txt)
 c cc (should not match) 

Question: How can I efficiently extract and match whole words in the second column of file2.txt, while handling inconsistent leading spaces? I prefer using grep and/or cut, but I'm open to small modifications.

Share · Improve this question · asked 2 days ago by Chuepapiii (reputation 91; 1 gold badge, 2 silver badges, 9 bronze badges) · 7 comments
  • 1 Are you sure you wouldn't rather use awk? It's built for searching, and knows about columns. – Gordon Davisson Commented 2 days ago
  • awk works, but I want to know how grep/cut would work – Chuepapiii Commented 2 days ago
  • "but due to the file size terminal runs infinitely" If it is really a matter of file size, then it is likely file1 size. One thing that you could try is splitting file1 in smaller chunks (e.g. with split), process them independently, and concatenate the outputs. You could measure the time T(1) it takes with a 1-line file1, progressively increase the number of lines, and find the n value that minimizes T(n) / n. – Renaud Pacalet Commented 2 days ago
  • please update the question with the size of both files; the complete output from wc file1 file2 should be sufficient – markp-fuso Commented 2 days ago
  • fwiw, the grep ... | grep ... script is invalid syntax and is missing any reference to file1 – markp-fuso Commented 2 days ago
 |  Show 2 more comments

2 Answers 2

Reset to default 2

Asking for help to do this efficiently with grep and cut is like asking for help constructing a garden fence with a kitchen fork and a paperclip. They're simply not the right tools for the job and so they cannot be used efficiently for this, nor can they be used robustly (or portably) without adding yet more tools to the mix to help them out. An awk-only solution, by contrast, would be trivial, efficient, and portable, e.g. the following will work using any POSIX awk:

$ awk 'NR == FNR{ tgts[$1]; next } $2 in tgts' file1 file2
 a aaa (should match)
     b bbb (should match and does but the former spaces throws off cut -d, resulting in incomplete output of line in the newFile.txt)

Original answer before I noticed the OP said "I want to check if each string exists in the second column of file2. file2 contains two single space-separated strings per line" and thought they wanted to match all "words" in file2:

$ cat tst.awk
# Print every line of the second file that contains, as a whole "word"
# (a run of alphanumerics and underscores), any target listed in the first file.
#
# Usage: awk -f tst.awk file1 file2

# First file (NR == FNR only while reading it): remember the first field of
# each line as a target word.
NR == FNR {
    # Skip blank lines: they would store "" as a target, and split() below
    # yields empty elements for leading/trailing separators, so every line
    # with leading spaces or trailing punctuation would match falsely.
    if ( NF ) {
        tgts[$1]
    }
    next
}
# Second file: break the line into words on any run of non-word characters
# and print the line once, as soon as one of its words is a target.
{
    split($0, words, /[^[:alnum:]_]+/)
    for ( i in words ) {
        if ( words[i] in tgts ) {
            print
            next
        }
    }
}

$ awk -f tst.awk file1 file2
 a aaa (should match)
     b bbb (should match and does but the former spaces throws off cut -d, resulting in incomplete output of line in the newFile.txt)

If you have more characters than just alpha-numerics and _ that you consider part of a "word" then just change [^[:alnum:]_] to include them, e.g. if a "word" can contain . and - then change it to [^[:alnum:]_.-]

I would have leaned towards using awk, but in this case I've used the paste command to join the entries of file1 into a single alternation pattern. I guess you can use something like this regex:

pattern=$(paste -sd'|' file1)
grep -E "^[[:space:]]*\S+[[:space:]]+($pattern)\b" file2 > newfile.txt

If I understand you correctly, this should extract the lines from file2 whose second column exactly matches one of the file1 entries, even when the line has leading spaces.

发布评论

评论列表(0)

  1. 暂无评论