return FALSE; $r = well_tag_thread__update(array('id' => $id), $update); return $r; } function well_tag_thread_find($tagid, $page, $pagesize) { $arr = well_tag_thread__find(array('tagid' => $tagid), array('id' => -1), $page, $pagesize); return $arr; } function well_tag_thread_find_by_tid($tid, $page, $pagesize) { $arr = well_tag_thread__find(array('tid' => $tid), array(), $page, $pagesize); return $arr; } ?>JavaScript regex matching word in sentences - Stack Overflow
最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

JavaScript regex matching word in sentences - Stack Overflow

programmeradmin3浏览0评论

What should be the regex for matching a specific word in every sentence in JavaScript?

The rules for matching the sentence are clear: It should end with dot (.) and the next letter should be capital.

But what I need to achieve is match a word in each sentence. So I suppose I should use groups. Or should I put the string word within the regex?

Here is my java regex for looping the sentences enter link

Here is my java regex for matching words in -5 +5 word context: enter link But I will need to have a combination of both in JavaScript.

My goal:

Input:

Cliffs have collapsed in New Zealand during an earthquake in the city of Christchurch on the South Island. No serious damage or fatalities were reported in the Valentine's Day quake that struck at 13:13 local time. Based on the med. report everybody were ok.

Output for chosen word "on":

  1. Cliffs have collapsed in New Zealand during an earthquake in the city of Christchurch on the South Island
  2. Based on the med. report everybody were ok.

What should be the regex for matching a specific word in every sentence in JavaScript?

The rules for matching the sentence are clear: It should end with dot (.) and the next letter should be capital.

But what I need to achieve is match a word in each sentence. So I suppose I should use groups. Or should I put the string word within the regex?

Here is my java regex for looping the sentences enter link

Here is my java regex for matching words in -5 +5 word context: enter link But I will need to have a combination of both in JavaScript.

My goal:

Input:

Cliffs have collapsed in New Zealand during an earthquake in the city of Christchurch on the South Island. No serious damage or fatalities were reported in the Valentine's Day quake that struck at 13:13 local time. Based on the med. report everybody were ok.

Output for chosen word "on":

  1. Cliffs have collapsed in New Zealand during an earthquake in the city of Christchurch on the South Island
  2. Based on the med. report everybody were ok.
Share Improve this question asked Feb 14, 2016 at 15:33 user2670818user2670818 7695 gold badges12 silver badges28 bronze badges 5
  • How about this text: Mr. Smith lives down the street. – anubhava Commented Feb 14, 2016 at 15:36
  • Yea, it will not work for some cases, but for my case will be ok for now. – user2670818 Commented Feb 14, 2016 at 15:36
  • Try LoL, str.replace(/\.\s*([A-Z])/g, '.LolLolLol$1').split('LolLolLol'); – Tushar Commented Feb 14, 2016 at 15:44
  • @Tushar You should post that as an answer. It's short and works really well. – Linus Oleander Commented Feb 14, 2016 at 19:08
  • @user2670818 Consider accepting my answer or giving feedback if its wrong. Thanks. – Linus Oleander Commented Feb 14, 2016 at 20:31
Add a comment  | 

1 Answer 1

Reset to default 4

Update: I provide two solutions below. My original answer only provided the first.

  1. One solution uses a single regex to try to parse the entire original paragraph. It can be done, but as described below, may not be the best solution.

  2. An alternative solution is a more involved algorithm, but uses lighter regex's. It splits the text into sentences and works on each sentence separately. This solution is much more efficient and, might I say, more elegant.

Solution 1: Single Regex

Run the first code snippet below to demo this solution. It finds all sentences (as you defined them) that contain any keyword you want. The complete regex is...

\. +([A-Z]([^.]|\.(?! +[A-Z]))*?" + keyword + "([^.]|\.(?! +[A-Z]))*?\.(?= +[A-Z]))

...but the code breaks it down into much more understandable pieces.

Once you click the 'Run code snippet' button, it takes a few seconds to run.

This is a fairly regex-heavy solution. It can be fairly slow. Using the example paragraph you provided, this routine becomes intolerably slow. Even being this slow, it is actually not complex enough, as it can't tell when the keyword is embedded in another word. (e.g. when looking for "cats" it will also find "catsup"). Trying to avoid that sort of embedding is possible, but it just made the whole thing too slow to even demonstrate.

var text = "I like cats. I really like cats. I also like dogs. Dogs and cats are pets. Approx. half of pets are cats. Approx. half of pets are dogs. Some cats are v. expensive.";

var keyword = "cats";

/**
 * Escapes every character that is special inside a regular expression so the
 * given string can be embedded in a pattern and matched literally.
 *
 * @param {string} str - raw text to embed in a pattern
 * @returns {string} the text with regex metacharacters backslash-escaped
 */
function escapeRegExp(str) {
  return String(str).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Finds every sentence of `sourceText` that contains `word`.
 *
 * A sentence ends with a period that is followed by one or more spaces and an
 * uppercase letter, so abbreviations such as "Approx. half" or "v. expensive"
 * do not end a sentence. The keyword is matched as a case-sensitive
 * substring, which means "cats" is also found inside "catsup".
 *
 * @param {string} sourceText - the paragraph to search
 * @param {string} word - the keyword to look for (matched literally)
 * @returns {string[]} the matching sentences, in order, each including its
 *   final period
 */
function findSentencesWithKeyword(sourceText, word) {
  // NOTE: the pattern is assembled from string literals, so each regex
  // backslash must be doubled; a single "\." in a string is just "." and
  // would match ANY character instead of a literal period.
  var reStr =
    "\\. +"                       + // a preceding sentence-ender, i.e. a period
                                    //   followed by one or more spaces
    "("                           + // begin remembering the match (match[1] below)
      "[A-Z]"                     + // a sentence-starter, i.e. an uppercase letter
      "(?:"                       + // start of a sentence-continuer, which is either
        "[^.]"                    + // anything but a period
        "|"                       + // or
        "\\.(?! +[A-Z])"          + // a period not followed by one or more spaces
                                    //   and an uppercase letter
      ")"                         + // end of a sentence-continuer
      "*?"                        + // zero or more sentence-continuers,
                                    //   but as few as possible
      escapeRegExp(word)          + // the keyword being sought, taken literally
      "(?:[^.]|\\.(?! +[A-Z]))"   + // a sentence-continuer, as described above
      "*?"                        + // zero or more of them but as few as possible
      "\\."                       + // a sentence-ender, i.e. a period
      "(?= +[A-Z])"               + // followed by one or more spaces and an
                                    //   uppercase letter, which is not consumed
    ")";                            // finish remembering the match

  // That ends up being the following pattern:
  // \. +([A-Z](?:[^.]|\.(?! +[A-Z]))*?KEYWORD(?:[^.]|\.(?! +[A-Z]))*?\.(?= +[A-Z]))

  var re = new RegExp(reStr, "g");

  // Add a sentence-ender (a period) before the text and a sentence-starter
  // (an uppercase letter) after it, so the first and last sentences are
  // delimited exactly like the ones in the middle.
  var expandedText = ". " + sourceText + " A";

  var hits = [];
  var match;
  while ((match = re.exec(expandedText)) !== null) {
    hits.push(match[1]);
    // The match stops right after its closing period (the lookahead consumes
    // nothing). Step back one character so that same period can serve as the
    // sentence-ender that introduces the next match.
    re.lastIndex -= 1;
  }
  return hits;
}

/**
 * Outputs one line of the report: as a paragraph in the page when a DOM
 * `document` is available, otherwise on the console.
 *
 * @param {string} msg - the text to display
 */
function show(msg) {
  if (typeof document !== "undefined") {
    document.write("<p>" + msg + "</p>");
  } else {
    console.log(msg);
  }
}

var sentencesWithKeyword = findSentencesWithKeyword(text, keyword);

// show the results
show("Text to search:");
show(text);
show("Query string: " + keyword);
show("Hits:");
for (var num = 0; num < sentencesWithKeyword.length; num += 1) {
  show((num + 1) + ". " + sentencesWithKeyword[num]);
}

Solution 2: Divide and Conquer

Here, you do the following:

  • split the original text into an array of sentence elements
  • search each sentence for the keyword
  • keep those that have the keyword, discard those that don't

That way, any regex's you use do not have to simultaneously deal with splitting into sentences, searching for the keyword, keeping hits and discarding non-hits, all in one massive regex.

var textToSearch = "I like cats. I really like cats. I also like dogs. Cats are great.  Catsup is tasty. Dogs and cats are pets. Approx. half of pets are cats. Approx. half of pets are dogs. Some cats are v. expensive.";

var keyword = "cats";

/**
 * Escapes every character that is special inside a regular expression so the
 * given string can be embedded in a pattern and matched literally.
 *
 * @param {string} str - raw text to embed in a pattern
 * @returns {string} the text with regex metacharacters backslash-escaped
 */
function escapeKeywordForRegExp(str) {
  return String(str).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Splits a paragraph into sentences. A sentence boundary is a period followed
 * by one or more spaces and an uppercase letter; the period stays with the
 * sentence it ends and the separating spaces are dropped.
 *
 * @param {string} paragraph - the text to split
 * @returns {string[]} the sentences in order (a single element when no
 *   boundary is found, including for an empty string)
 */
function splitIntoSentences(paragraph) {
  var boundary = /\. +(?=[A-Z])/g;
  var pieces = [];
  var start = 0;
  var match;
  while ((match = boundary.exec(paragraph)) !== null) {
    // match.index is the period itself; keep it as part of the sentence
    pieces.push(paragraph.slice(start, match.index + 1));
    // the lookahead consumed nothing, so lastIndex is the next sentence start
    start = boundary.lastIndex;
  }
  pieces.push(paragraph.slice(start));
  return pieces;
}

/**
 * Splits `paragraph` into sentences and sorts them by whether they contain
 * `word` as a whole word (case-insensitive).
 *
 * @param {string} paragraph - the text to search
 * @param {string} word - the keyword to look for (matched literally)
 * @returns {{all: string[], withKeyword: string[], withNoKeyword: string[]}}
 *   every sentence, plus the sentences with and without the keyword
 */
function classifySentences(paragraph, word) {
  // Built once, outside the loop. The keyword must be delimited on both sides
  // by a non-word character or a string edge, so "cats" matches "cats,",
  // "cats?" and "cats." but not "catsup". No "g" flag, so test() is stateless.
  var keywordRegex = new RegExp(
    "(?:^|\\W)" + escapeKeywordForRegExp(word) + "(?:\\W|$)",
    "i"
  );

  var result = {
    all           : splitIntoSentences(paragraph),
    withKeyword   : [],
    withNoKeyword : []
  };

  result.all.forEach(function (sentence) {
    if (keywordRegex.test(sentence)) {
      result.withKeyword.push(sentence);
    } else {
      result.withNoKeyword.push(sentence);
    }
  });

  return result;
}

var sentences = classifySentences(textToSearch, keyword);

// Print the result as indented JSON: in the page when a DOM `document` is
// available, otherwise on the console.
var report = JSON.stringify(sentences, null, 2);
if (typeof document !== "undefined") {
  document.write("<pre>" + report + "</pre>");
} else {
  console.log(report);
}

发布评论

评论列表(0)

  1. 暂无评论