I have two files, a PDF and an HTML file. I read both in as strings — plain text (after extracting the text from the PDF) and HTML — and am now trying to give the plain text the same HTML tags as the HTML string, so that I can then compare the two to find the differences.
Final edit: a simple example that is not currently working:
// Sample inputs: plain text extracted from the PDF, and the HTML markup to align it with.
var text1="here is example text";
var text2="<html><body><div>here is another <span>example</span> text</div></body></html>";
// Load the HTML into a detached <div> so its text can be read back without the tags.
var div = document.createElement("div");
div.innerHTML = text2;
var text = div.textContent || div.innerText || "";
// Words of the HTML text, split on single spaces.
var content= text.split(" ");
// Every tag in the HTML string: a non-greedy match from "<" up to the next ">".
var alltags=text2.match(/<.+?>/g);
// Words of the PDF text, split on single spaces.
var pdfwords=text1.split(" ");
var output="";
// NOTE(review): the outer loop counts tags (j) but j is used to index `content` (words),
// while the inner loop counts PDF words (i) but i is used to index `alltags`. The two
// arrays have different lengths, so tags and words are not paired by position here.
for(var j=0; j<alltags.length; j++){
// NOTE(review): `i` is not declared in this snippet, so it becomes an implicit global
// (a ReferenceError in strict mode).
for(i=0; i<pdfwords.length; i++){
if(pdfwords[i]===content[j]){
output+=alltags[i]+pdfwords[i];
}
}
}
// Write the assembled string into the page.
document.write(output);
output should be
"<html><body><div>here is another<span>example</span> text</div></body></html>"
Diffing the two strings, output and text2, should then show the difference: "another" was inserted.
I have two files, a PDF and an HTML file. I read both in as strings — plain text (after extracting the text from the PDF) and HTML — and am now trying to give the plain text the same HTML tags as the HTML string, so that I can then compare the two to find the differences.
Final edit: a simple example that is not currently working:
// Sample inputs: plain text extracted from the PDF, and the HTML markup to align it with.
var text1="here is example text";
var text2="<html><body><div>here is another <span>example</span> text</div></body></html>";
// Load the HTML into a detached <div> so its text can be read back without the tags.
var div = document.createElement("div");
div.innerHTML = text2;
var text = div.textContent || div.innerText || "";
// Words of the HTML text, split on single spaces.
var content= text.split(" ");
// Every tag in the HTML string: a non-greedy match from "<" up to the next ">".
var alltags=text2.match(/<.+?>/g);
// Words of the PDF text, split on single spaces.
var pdfwords=text1.split(" ");
var output="";
// NOTE(review): the outer loop counts tags (j) but j is used to index `content` (words),
// while the inner loop counts PDF words (i) but i is used to index `alltags`. The two
// arrays have different lengths, so tags and words are not paired by position here.
for(var j=0; j<alltags.length; j++){
// NOTE(review): `i` is not declared in this snippet, so it becomes an implicit global
// (a ReferenceError in strict mode).
for(i=0; i<pdfwords.length; i++){
if(pdfwords[i]===content[j]){
output+=alltags[i]+pdfwords[i];
}
}
}
// Write the assembled string into the page.
document.write(output);
output should be
"<html><body><div>here is another<span>example</span> text</div></body></html>"
Diffing the two strings, output and text2, should then show the difference: "another" was inserted.
Share Improve this question edited May 27, 2016 at 17:21 0101 asked May 16, 2016 at 23:44 01010101 1,0763 gold badges14 silver badges25 bronze badges 13 | Show 8 more comments4 Answers
Reset to default 7 +25 This is a simple solution to what you want. It is dynamic: it handles any tags found and compares only the text content. The findDiff() function
finds the differences and calls the callback function with the output and an array of the differing words as parameters.
JSFiddle: https://jsfiddle.net/9svuc7om/18/
/**
 * Parse and construct an Array of PDF text tokens.
 *
 * The text is split on single space characters. Leading, trailing and
 * consecutive spaces produce empty strings in the split result; all of them
 * are dropped so that only non-empty words are returned.
 *
 * @param {string} text The PDF text to be parsed
 * @return {string[]} The parsed Array of non-empty word tokens
 */
function parsePDFText(text) {
  // filter() builds a new array instead of splicing the one being iterated,
  // so no element is skipped when several empty tokens are adjacent.
  return text.split(' ').filter(function (word) {
    return word !== '';
  });
}
/**
 * Return the minimum indexOf among all the arguments.
 *
 * Arguments equal to -1 (the "not found" result of indexOf()) are ignored.
 * An index of 0 is a valid result and is returned as 0.
 *
 * @param {...number} index The indexOf results to compare
 * @return {number} The minimum indexOf, -1 if all arguments are -1 (or none are given)
 */
function findMinIndex() {
  var min;
  for (var i = 0, l = arguments.length; i < l; i++) {
    // indexOf() returns -1 if not found
    if (arguments[i] === -1) {
      continue;
    }
    if (typeof min === 'undefined' || arguments[i] < min) {
      min = arguments[i];
    }
  }
  // Test for "nothing found" explicitly: `min || -1` would wrongly map a
  // legitimate minimum of 0 to -1 because 0 is falsy.
  return typeof min === 'undefined' ? -1 : min;
}
/**
 * Parse and construct an Array of HTML tokens.
 *
 * The input is scanned left to right and cut into three kinds of tokens:
 *   - 'tag':   from a '<' up to and including the next '>'
 *   - 'space': a single space character
 *   - 'text':  a run of characters delimited by a '<', a space, or the end of input
 * Every token starts with `valid: true`.
 *
 * @param {string} text The HTML text to be parsed
 * @return {object[]} The parsed Array of tokens, each `{type, content, valid}`
 */
function parseHTMLText(text) {
  var currentIndex = 0,
      tl = text.length,
      tokens = [],
      token, firstChar, endPos, closePos;
  while (currentIndex < tl) {
    // determine the next token type
    firstChar = text.charAt(currentIndex);
    if (firstChar === '<') {
      // a tag: find the position of its closing '>'
      closePos = text.indexOf('>', currentIndex + 1);
      // An unclosed '<' has no '>' (indexOf gives -1). Consume the rest of the
      // input as the tag so the cursor always moves forward; computing
      // `-1 + 1` would reset the cursor to 0 and loop forever.
      endPos = closePos === -1 ? tl : closePos + 1;
      token = {
        type: 'tag',
        content: text.slice(currentIndex, endPos),
        valid: true
      };
      currentIndex = endPos;
    } else if (firstChar === ' ') {
      // a space
      token = {
        type: 'space',
        content: ' ',
        valid: true
      };
      currentIndex++;
    } else {
      // a character, possibly part of a word
      // find the end of the word
      // assume a word is delimited either by tags or space
      endPos = findMinIndex(text.indexOf('<', currentIndex), text.indexOf(' ', currentIndex));
      // endPos is `-1` if there is no delimiter anymore, end of string reached
      if (endPos === -1) {
        endPos = tl;
      }
      token = {
        type: 'text',
        content: text.slice(currentIndex, endPos),
        valid: true
      };
      currentIndex = endPos;
    }
    tokens.push(token);
  }
  return tokens;
}
/**
 * Find the difference between pdf text and html text and pass the output and
 * the differences to a callback function.
 *
 * The pdf words are treated as the reference sequence. The html tokens are
 * walked once, in order; every html text token that does not line up with the
 * next expected pdf word is marked invalid and recorded in the diff array.
 * The output is the concatenation of all tokens still marked valid (tags and
 * spaces are never invalidated).
 *
 * @param {string} pdfText The pdf text
 * @param {string} htmlText The html text
 * @param {function(string, string[])} callback Called synchronously with the
 *   output string and the array of html words that were not matched
 */
function findDiff(pdfText, htmlText, callback) {
  var output = '', // the final output
      diff = [], // the array of different words
      pdfTokens = parsePDFText(pdfText),
      htmlTokens = parseHTMLText(htmlText),
      j = 0, hl = htmlTokens.length;
  // the pdf text is the reference point, i.e. all the words in pdf text should always be present in html text as well
  for (var i = 0, pl = pdfTokens.length; i < pl; i++) {
    // find the first occurrence of the pdf word; `j` is shared across
    // iterations so the html tokens are only ever scanned forward
    for (; j < hl; j++) {
      if (htmlTokens[j].type !== 'text') {
        // exclude comparison to non-text
        continue;
      }
      // check if the two texts match
      if (htmlTokens[j].content === pdfTokens[i]) {
        // a match is found; step past it before moving to the next pdf word
        j++;
        break;
      }
      // push the different html token into `diff` array
      diff.push(htmlTokens[j].content);
      // set the `valid` field of token to false
      htmlTokens[j].valid = false;
    }
  }
  // invalidate the rest of the html text; these trailing words are absent
  // from the pdf text too, so they are reported in `diff` like the others
  for (; j < hl; j++) {
    if (htmlTokens[j].type === 'text') {
      diff.push(htmlTokens[j].content);
      htmlTokens[j].valid = false;
    }
  }
  // concat the final string to output
  for (j = 0; j < hl; j++) {
    if (htmlTokens[j].valid) {
      output += htmlTokens[j].content;
    }
  }
  callback(output, diff);
}
And you can call the function by using
// Example call: the callback receives the output string and the array of differing words
// and logs both. `text1` / `text2` are not defined in this snippet — presumably the PDF
// text and HTML string from the question.
findDiff(text1, text2, function(output, diff) {
console.log(output);
console.log(diff);
});
However, there are some limitations in this solution:
- It assumes all the content in the PDF is present in the HTML text.
- It only handles `<`, `>` and space as delimiters; if there are other possible delimiters, e.g. tabs, extra code is needed.
- It assumes all tags are well-formed, and that there will be no tag characters in between the text content (if you need them, you should use `&gt;` and `&lt;` instead).
- The function is a simplified solution and is not fully tested. You cannot expect any warranty from it, and some adaptation is needed.
I would suggest providing only the content inside `body`, or even a narrower range, instead of the whole HTML file (if that is possible in your case), because there will be too many variations in the content of an HTML file.
The easiest way is
// Split the sample sentence into words and slot them into a fixed HTML skeleton:
// the first word is plain, the second is wrapped in <span>, and the last two
// follow the span separated by a single space.
var s = "Hello everyone on stackoverflow";
var s_split = s.split(' ');
var y = '<html><head></head><body><div>'.concat(
  s_split[0],
  '<span>', s_split[1], '</span>',
  s_split[2], ' ', s_split[3],
  '</div></body></html>'
);
Check the jsfiddle
Why not simply strip the HTML tags and compare the text?
// Plain-text sentence and the HTML markup that may carry the same words.
var s = "Hello everyone on stackoverflow";
var y = "<html><head><head><body><div>Hello<span>everyone</span>on stackoverflow</div></body></html>";
// Use a regular expression to match HTML tags and replace each one with a single space,
// then trim the result so the extra whitespace at either end is removed.
// NOTE(review): every tag becomes its own space, so two tags in a row between words (or a
// tag next to an existing space) would leave consecutive spaces and make the comparison
// below fail — confirm against real inputs.
var z = y.replace(/(<([^>]+)>)/ig, ' ').trim();
// If the stripped string matches the plain text, adopt the HTML string as the result.
if(z == s) {
s = y;
}
// Show the result: the HTML string on a match, the original plain text otherwise.
alert(s);
fiddle
If you have to wrap a specific word or piece of text, then search for it and replace it, something like this:
// Full sentence, followed by the three fragments that should be wrapped in tags.
var f = "Hello everyone on stackoverflow";
var o = "Hello";
var e = "everyone on";
var s = "stackoverflow";
// Wrap the first occurrence of `e` in <strong> if the sentence contains it;
// otherwise carry the sentence over unchanged.
if (f.indexOf(e) >= 0) {
var h = f.replace(e,"<strong>"+e+"</strong>");
}else{
var h = f;
}
// Wrap the first occurrence of `s` in <em>, working on the already marked-up string.
if (h.indexOf(s) >= 0){
var h = h.replace(s,"<em>"+s+"</em>");
}
// Wrap the first occurrence of `o` in <u>.
if (h.indexOf(o) >= 0){
var h = h.replace(o,"<u>"+o+"</u>");
}
// Append the marked-up sentence to the page inside a <div>.
// `$` is not defined in this snippet — presumably jQuery; confirm it is loaded.
$('body').append('<div>'+h+'</div>');
Example here: https://jsfiddle.net/jwqrgsL1/1/
<script language="JavaScript" src="./javatest_files/metrics_group1.js"></script>
is meaningless in this situation – lhrec_106 Commented May 17, 2016 at 3:28