javascript - insert html elements into string of text to match another string of html

I have two files, a PDF and an HTML file. I read both in as strings — plain text (extracted from the PDF) and HTML — and am now trying to give the plain text the same HTML tags as the HTML string, so that I can then compare the two to find the differences.

Final Edit of simple example not currently working

// Attempt from the question: rebuild text1 with the tags found in text2.
// text1 is the plain text (per the question, extracted from the PDF).
var text1="here is example text";

// text2 is the HTML string whose tags should be copied onto text1.
var text2="<html><body><div>here is another <span>example</span> text</div></body></html>";

// Let the browser parse text2 inside a detached <div> and read back its text
// content, i.e. the HTML without any tags.
var div = document.createElement("div");
div.innerHTML = text2;
var text = div.textContent || div.innerText || "";

// Words of the HTML's text content, split on single spaces.
var content=  text.split(" ");
// Every "<...>" match (non-greedy) in text2, opening and closing tags alike.
var alltags=text2.match(/<.+?>/g);
// Words of the plain text, split on single spaces.
var pdfwords=text1.split(" ");
var output="";
// NOTE(review): j runs up to alltags.length but is used to index `content`,
// so content[j] is undefined once j passes the number of words.
for(var j=0; j<alltags.length; j++){
   // NOTE(review): `i` is never declared here, so it becomes an implicit global.
   for(i=0; i<pdfwords.length; i++){
      if(pdfwords[i]===content[j]){

         // NOTE(review): the tag is chosen by the word's index in pdfwords,
         // not by where the word sits in the markup.
         output+=alltags[i]+pdfwords[i];
      }
    }
}

document.write(output);

output should be

"<html><body><div>here is another<span>example</span> text</div></body></html>"

Diffing the two strings, output and text2, should then show the difference: the word "another" was inserted.

Final Edit of simple example not currently working

// Attempt from the question: rebuild text1 with the tags found in text2.
// text1 is the plain text (per the question, extracted from the PDF).
var text1="here is example text";

// text2 is the HTML string whose tags should be copied onto text1.
var text2="<html><body><div>here is another <span>example</span> text</div></body></html>";

// Let the browser parse text2 inside a detached <div> and read back its text
// content, i.e. the HTML without any tags.
var div = document.createElement("div");
div.innerHTML = text2;
var text = div.textContent || div.innerText || "";

// Words of the HTML's text content, split on single spaces.
var content=  text.split(" ");
// Every "<...>" match (non-greedy) in text2, opening and closing tags alike.
var alltags=text2.match(/<.+?>/g);
// Words of the plain text, split on single spaces.
var pdfwords=text1.split(" ");
var output="";
// NOTE(review): j runs up to alltags.length but is used to index `content`,
// so content[j] is undefined once j passes the number of words.
for(var j=0; j<alltags.length; j++){
   // NOTE(review): `i` is never declared here, so it becomes an implicit global.
   for(i=0; i<pdfwords.length; i++){
      if(pdfwords[i]===content[j]){

         // NOTE(review): the tag is chosen by the word's index in pdfwords,
         // not by where the word sits in the markup.
         output+=alltags[i]+pdfwords[i];
      }
    }
}

document.write(output);

output should be

"<html><body><div>here is another<span>example</span> text</div></body></html>"

Diffing the two strings, output and text2, should then show the difference: the word "another" was inserted.

Share Improve this question edited May 27, 2016 at 17:21 asked May 16, 2016 at 23:44 0101 1,0763 gold badges14 silver badges25 bronze badges

why don't you strip html tags and just compare plain text? – Jakob Commented May 17, 2016 at 0:46
That does not work, because the PDF text can be placed lines away from the corresponding HTML text, so the two do not match. I already tried that, and the HTML text has an extra section as well – 0101 Commented May 17, 2016 at 0:49
1 The HTML provided contains a large amount of CSS, JavaScript tags, inline JavaScript, meta tags and HTML comments. If you want to keep them and compare, it is impossible to create an automatic algorithm to do it. So first you have to specify exactly what you want to compare, because comparing <script language="JavaScript" src="./javatest_files/metrics_group1.js"></script> is meaningless in this situation – lhrec_106 Commented May 17, 2016 at 3:28
Ignore the script tags, only focus on markup so html tags and inline css – 0101 Commented May 17, 2016 at 3:29
What is the extra section you mentioned, and what happened when you tried stripping the tags and comparing? – lhrec_106 Commented May 17, 2016 at 3:31

| Show 8 more comments

4 Answers 4

Sorted by: Reset to default 7 +25

This is a simple solution for what you want. It is dynamic in that it handles any tags it finds and compares only the text content. findDiff() finds the differences and calls the callback function with the output and an array of the differing words as parameters.

JSFiddle: https://jsfiddle.net/9svuc7om/18/

/**
 * Parse the PDF text into an array of word tokens.
 *
 * The text is split on single spaces; the empty strings produced by leading,
 * trailing or consecutive spaces are discarded.
 *
 * @param {string} text   The PDF text to be parsed
 * @return {string[]}     The non-empty tokens, in their original order
 */
function parsePDFText(text) {
    // filter() inspects every element exactly once, so even long runs of
    // spaces cannot leave an empty token behind (removing elements while
    // iterating by index would skip the element that follows each removal)
    return text.split(' ').filter(function (token) {
        return token !== '';
    });
}

/**
 * Return the smallest index among all the arguments.
 * Arguments equal to -1 (the "not found" result of indexOf()) are ignored.
 *
 * @param {...number} index  The indexOf() results to compare
 * @return {number}          The minimum index (0 is a valid result), or -1 if
 *                           no arguments were given or all of them are -1
 */
function findMinIndex() {
    var min;
    for (var i = 0, l = arguments.length; i < l; i++) {
        // indexOf() returns -1 if not found
        if (arguments[i] === -1) {
            continue;
        }
        if (typeof min === 'undefined' || arguments[i] < min) {
            min = arguments[i];
        }
    }
    // compare against undefined explicitly: a truthiness check would turn a
    // legitimate minimum of 0 into -1
    return typeof min === 'undefined' ? -1 : min;
}

/**
 * Parse the HTML text into an array of tokens.
 *
 * Each token is an object `{type, content, valid}` where `type` is 'tag'
 * (from '<' up to and including the next '>'), 'space' (a single ' ') or
 * 'text' (a run of characters delimited by '<' or ' '), `content` is the
 * matching slice of the input and `valid` starts out as true.
 *
 * @param {string} text   The HTML text to be parsed
 * @return {Object[]}     The parsed tokens, in input order
 */
function parseHTMLText(text) {
    var currentIndex = 0,
        tl = text.length,
        tokens = [],
        token, firstChar, endPos;
    while (currentIndex < tl) {
        // determine the next token type
        firstChar = text.charAt(currentIndex);
        if (firstChar === '<') {
            // a tag: it ends at the next '>'
            endPos = text.indexOf('>', currentIndex + 1);
            // an unterminated tag swallows the rest of the input; blindly
            // using indexOf() + 1 would give 0 here, rewind currentIndex and
            // make the loop run forever
            endPos = endPos === -1 ? tl : endPos + 1;
            token = {
                type: 'tag',
                content: text.slice(currentIndex, endPos),
                valid: true
            };
            currentIndex = endPos;
        } else if (firstChar === ' ') {
            // a space
            token = {
                type: 'space',
                content: ' ',
                valid: true
            };
            currentIndex++;
        } else {
            // a character, possibly part of a word
            // a word is delimited either by a tag or by a space
            endPos = findMinIndex(text.indexOf('<', currentIndex), text.indexOf(' ', currentIndex));
            // endPos is -1 if there is no delimiter left: the word runs to the end of the string
            if (endPos === -1) {
                endPos = tl;
            }
            token = {
                type: 'text',
                content: text.slice(currentIndex, endPos),
                valid: true
            };
            currentIndex = endPos;
        }
        tokens.push(token);
    }
    return tokens;
}

/**
 * Find the difference between the pdf text and the html text and pass the
 * result to a callback function.
 *
 * The pdf words are matched, in order, against the text tokens of the html.
 * Html words that are skipped before a match, or left over after the last
 * pdf word, are removed from the output and collected in `diff`. Tags and
 * spaces are always kept.
 *
 * @param {string} pdfText     The pdf text
 * @param {string} htmlText    The html text
 * @param {function(string, string[])} callback  Called once with the html
 *        text stripped of the extra words, and the array of those words
 */
function findDiff(pdfText, htmlText, callback) {
    var output = '', // the final output
        diff = [], // the array of different words
        pdfTokens = parsePDFText(pdfText),
        htmlTokens = parseHTMLText(htmlText),
        j = 0, hl = htmlTokens.length;
    // the pdf text is the reference point, i.e. all the words in the pdf text
    // are expected to be present in the html text as well
    for (var i = 0, pl = pdfTokens.length; i < pl; i++) {
        // find the next occurrence of the pdf word; j is shared across
        // iterations so the html is only ever scanned forwards
        for (; j < hl; j++) {
            if (htmlTokens[j].type !== 'text') {
                // exclude comparison to non-text
                continue;
            }
            if (htmlTokens[j].content === pdfTokens[i]) {
                // a match is found, continue after it for the next pdf word
                j++;
                break;
            }
            // an html word that is not in the pdf at this position
            diff.push(htmlTokens[j].content);
            htmlTokens[j].valid = false;
        }
    }
    // the words left over after the last pdf word are differences as well:
    // report them in `diff` in addition to dropping them from the output
    for (; j < hl; j++) {
        if (htmlTokens[j].type === 'text') {
            diff.push(htmlTokens[j].content);
            htmlTokens[j].valid = false;
        }
    }
    // concat the surviving tokens into the final string
    for (j = 0; j < hl; j++) {
        if (htmlTokens[j].valid) {
            output += htmlTokens[j].content;
        }
    }
    callback(output, diff);
}

And you can call the function by using

// Compare the two strings, then log the filtered markup followed by the
// list of words that were removed from it
findDiff(text1, text2, function (markup, removedWords) {
    console.log(markup);
    console.log(removedWords);
});

However, there are some limitations in this solution

It assumes all the content in pdf are present in the HTML text
It only handles <> and spaces; if there are other possible delimiters, e.g. tabs, extra code is needed
It assumes all tags are well-formed and that no literal angle brackets appear inside the text content (if you need them there, use the entities &gt; and &lt; instead)
The function is a simplified solution and is not fully tested. It comes with no warranty, and some adaptation will be needed. I would suggest providing only the content inside the body, or an even narrower range, instead of the whole HTML file (if that is possible in your case), because there are too many variations in the content of an HTML file.

The easiest way is

// Split the sentence into words and stitch the markup together around them
var s = "Hello everyone on stackoverflow";
var s_split = s.split(' ');
var y = [
    '<html><head></head><body><div>',
    s_split[0],
    '<span>', s_split[1], '</span>',
    s_split[2], ' ', s_split[3],
    '</div></body></html>'
].join('');

Check the jsfiddle

Why not simply strip the HTML tags and compare the text?

var s = "Hello everyone on stackoverflow";

// Sample markup; note that <head> is closed before <body> opens.
var y = "<html><head></head><body><div>Hello<span>everyone</span>on stackoverflow</div></body></html>";

// Replace every HTML tag with a single space, so that words separated only by
// a tag stay separated, then trim the whitespace the outer tags leave at
// either end.
var z = y.replace(/<[^>]+>/g, ' ').trim();

// If the stripped markup matches the plain text, take over the markup.
if (z === s) {
    s = y;
}
alert(s);

fiddle

If you have to wrap a specific word or piece of text, search for it and replace it, something like this:

var f = "Hello everyone on stackoverflow";
var o = "Hello";
var e = "everyone on";
var s = "stackoverflow";

// Wrap each phrase that occurs in the text in its own tag; a phrase that is
// not present leaves the string as it is.
var h = f.indexOf(e) < 0 ? f : f.replace(e, "<strong>" + e + "</strong>");
h = h.indexOf(s) < 0 ? h : h.replace(s, "<em>" + s + "</em>");
h = h.indexOf(o) < 0 ? h : h.replace(o, "<u>" + o + "</u>");

// Show the decorated text on the page.
$('body').append('<div>' + h + '</div>');

Example here: https://jsfiddle.net/jwqrgsL1/1/

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

javascript - insert html elements into string of text to match another string of html - Stack Overflow

4 Answers 4

与本文相关的文章

评论列表(0)