最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

javascript - Convert pdf to html files using node.js and pdf.js - Stack Overflow

programmeradmin3浏览0评论

I want to convert pdf to html pages using pdf.js. Pdf.js does that in a browser but is it possible to get those html pages rendered by browser in backend thus converting a pdf of n pages to n number of html files. I am using node.js as backend. I have tried pdf2html and other similar npm modules, they don't work great and have issues with some pdfs. Thank you for suggestions.

I want to convert pdf to html pages using pdf.js. Pdf.js does that in a browser but is it possible to get those html pages rendered by browser in backend thus converting a pdf of n pages to n number of html files. I am using node.js as backend. I have tried pdf2html and other similar npm modules, they don't work great and have issues with some pdfs. Thank you for suggestions.

Share Improve this question asked Jun 10, 2019 at 12:52 last_fixlast_fix 3791 gold badge4 silver badges20 bronze badges 3
  • Your Solution is here-> bytescout.com/articles/… – Anand Choudhary Commented Jun 10, 2019 at 12:57
  • this is not free! :( – last_fix Commented Jun 10, 2019 at 13:04
  • pdf.js convert pdf to image (canvas, png etc. ). It won't convert PDF to HTML. – shaochuancs Commented Jun 10, 2019 at 15:05
Add a comment  | 

1 Answer 1

Reset to default 3

Maybe I found something similar - I am working with local PDF file and browser. I made small changes in ready made viewer.js / PDF.js, it should be possible to process using both Node.js & browser.

This script embeds the PDF specified by the command-line argument into the viewer.js Webpack bundle and then starts the browser.

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const chp = require('child_process');
// datauri is loaded from the global npm folder under %APPDATA% (installed with `npm i -g datauri`).
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));

// File name of the sample PDF that the stock viewer.js opens by default.
// It is the placeholder that gets replaced by the data URI of the requested PDF.
const DEFAULT_PDF_NAME = 'compressed.tracemonkey-pldi-09.pdf';

/**
 * Builds a patched copy of the viewer source that opens `dataUri` instead of the
 * bundled sample PDF and additionally lists "file://" in HOSTED_VIEWER_ORIGINS.
 *
 * @param {string} viewerSource - text of viewer.js.
 * @param {string} dataUri - data URI of the PDF to embed.
 * @returns {string} the patched viewer source.
 * @throws {Error} when the placeholder name or the origins list cannot be located
 *   (for example because the file was already patched by a previous run).
 */
function embedPdf(viewerSource, dataUri) {
    const nameStart = viewerSource.indexOf(DEFAULT_PDF_NAME);
    if (nameStart < 0) {
        throw new Error('"' + DEFAULT_PDF_NAME + '" not found in viewer source; was it already patched?');
    }
    const nameEnd = nameStart + DEFAULT_PDF_NAME.length;
    const originsDecl = viewerSource.indexOf('HOSTED_VIEWER_ORIGINS');
    // Position of the "]" that closes the first array following the identifier.
    const originsEnd = originsDecl < 0 ? -1 : viewerSource.indexOf(']', originsDecl);
    if (originsEnd < nameEnd) {
        throw new Error('HOSTED_VIEWER_ORIGINS list not found after the default PDF name in viewer source');
    }
    return viewerSource.slice(0, nameStart) + dataUri +
        viewerSource.slice(nameEnd, originsEnd) + ', "file://"' +
        viewerSource.slice(originsEnd);
}

datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer.js');
    // viewer.js is patched in place, so a second run no longer finds the placeholder;
    // embedPdf then throws instead of writing a corrupted file.
    const patched = embedPdf(fs.readFileSync(viewerJSpath, 'utf-8'), content);
    fs.writeFileSync(viewerJSpath, patched, 'utf-8');
    // Executing the .html path lets the shell open it with the associated browser.
    const c = path.join(__dirname, 'viewer.html');
    chp.execSync(c);
});

Then I tried adding the original width as an additional style parameter in renderTextLayer's appendText method, and sorting the elements by position in TextLayerBuilder's render method, next to this.textLayerDiv.appendChild(textLayerFrag);.

All mentioned PDF.js changes are on my Github; it seems only the web and build folders are required (plus npm i -g datauri, for example).

Readability improvements ? #12512 PR

Console script / bookmarklet


Using puppeteer and a slightly modified PDF.js it is possible to convert directly (works both headful and headless, but element sizes differ slightly)

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
// Both modules are loaded from the global npm folder under %APPDATA%.
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const puppeteer = require(path.join(process.env.APPDATA, 'npm/node_modules', 'puppeteer'));

// File name of the sample PDF that the stock viewer opens by default.
// It is the placeholder that gets replaced by the data URI of the requested PDF.
const DEFAULT_PDF_NAME = 'compressed.tracemonkey-pldi-09.pdf';

/**
 * Builds a patched copy of the viewer source that opens `dataUri` instead of the
 * bundled sample PDF and additionally lists "file://" in HOSTED_VIEWER_ORIGINS.
 *
 * @param {string} viewerSource - text of viewerSrc.js.
 * @param {string} dataUri - data URI of the PDF to embed.
 * @returns {string} the patched viewer source.
 * @throws {Error} when the placeholder name or the origins list cannot be located.
 */
function embedPdf(viewerSource, dataUri) {
    const nameStart = viewerSource.indexOf(DEFAULT_PDF_NAME);
    if (nameStart < 0) {
        throw new Error('"' + DEFAULT_PDF_NAME + '" not found in viewer source');
    }
    const nameEnd = nameStart + DEFAULT_PDF_NAME.length;
    const originsDecl = viewerSource.indexOf('HOSTED_VIEWER_ORIGINS');
    // Position of the "]" that closes the first array following the identifier.
    const originsEnd = originsDecl < 0 ? -1 : viewerSource.indexOf(']', originsDecl);
    if (originsEnd < nameEnd) {
        throw new Error('HOSTED_VIEWER_ORIGINS list not found after the default PDF name in viewer source');
    }
    return viewerSource.slice(0, nameStart) + dataUri +
        viewerSource.slice(nameEnd, originsEnd) + ', "file://"' +
        viewerSource.slice(originsEnd);
}

/**
 * Opens viewer.html in a puppeteer-controlled browser and writes the element
 * lists that the page passes to window.reader into PDFtexts.txt.
 *
 * @returns {Promise<void>} resolves once navigation has finished.
 */
async function extractTexts() {
    const browser = await puppeteer.launch({
        // headless: false
    });
    try {
        const pages = await browser.pages();
        const page = pages[0];
        // Register the callback BEFORE navigating, and wait for it, so the page
        // cannot call window.reader while it is still undefined.
        await page.exposeFunction('reader', (elLists) => {
            fs.writeFileSync(path.join(__dirname, 'PDFtexts.txt'), JSON.stringify(elLists, null, 4));
            setTimeout(() => { browser.close(); }, 100);
        });
        const c = path.join(__dirname, 'viewer.html');
        await page.goto('file:///' + c);
    } catch (error) {
        // Do not leave a stray browser process behind when setup or navigation fails.
        await browser.close();
        throw error;
    }
}

datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer');
    // The untouched template is read from viewerSrc.js and the patched copy is
    // written to viewer.js, so every run starts from a clean template.
    const template = fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8');
    fs.writeFileSync(viewerJSpath + '.js', embedPdf(template, content), 'utf-8');
    extractTexts().catch((error) => {
        console.error(error);
        process.exitCode = 1;
    });
});

Fixes required for puppeteer/chromium:

const message = exception?.message; // => exception.message
page: this.pageLabel ?? this.id // => this.pageLabel || this.id

viewer.js => viewerSrc.js basic additions:

function webViewerPageRendered({
...
  if (pageNumber < PDFViewerApplication.pagesCount) {
    arguments[0].source.eventBus.dispatch("pagenumberchanged", {
      value: pageNumber + 1
    }); // generate all remaining pages
  }
}

class BaseViewer {
  constructor(options) {
    this.pageNo = []; // rendered pages array
...
  _setCurrentPageNumber(val, resetCurrentPageView = false) {
...
    if (this.pageNo.indexOf(val) < 0) {
      this.pageNo.push(val);
    }
    if (this.pagesCount - 1 <= this.pageNo.length) {
      window.reader(elLists); // sent result back 2 node.js
    }

And result looks like {PageNo:{ElNo:{data}, ...}, ...} and could be simply translated to web page or further processed.

{
    "1": {
        "0": {
            "x": 99.9871,
            "y": 98.0496,
            "w": 557.695,
            "h": 22,
            "text": "Trace-based Just-in-Time Type Specialization for Dynamic",
            "ff": "sans-serif",
            "fs": "22.2695px",
            "cssText": "left: 99.9871px; top: 98.0496px; width: 557.695px; font-size: 22.2695px; font-family: sans-serif; transform: scaleX(0.970163);"
        },
        "1": {
            "x": 327.478,
            "y": 122.793,
            "w": 102.707,
            "h": 22,
            "text": "Languages",
            "ff": "sans-serif",
            "fs": "22.2695px",
            "cssText": "left: 327.478px; top: 122.793px; width: 102.707px; font-size: 22.2695px; font-family: sans-serif; transform: scaleX(0.932262);"
        },
...
    "2": {
        "0": {
            "x": 393.677,
            "y": 90.3408,
            "w": 192.909,
            "h": 11,
            "text": "1 for (var i = 2; i < 100; ++i) {",
            "ff": "monospace",
            "fs": "11.1347px",
            "cssText": "left: 393.677px; top: 90.3408px; width: 192.909px; font-size: 11.1347px; font-family: monospace; transform: scaleX(0.875232);"
        },
        "1": {
            "x": 67.0588,
            "y": 91.7599,
            "w": 173.346,
            "h": 11,
            "text": "Hence, recording and compiling a trace",
            "ff": "sans-serif",
            "fs": "11.1347px",
            "cssText": "left: 67.0588px; top: 91.7599px; width: 173.346px; font-size: 11.1347px; font-family: sans-serif; transform: scaleX(0.895175);"
        },






Summary of changes (in original gh-pages branch):

- changes in PDF.js

  function appendText(task, geom, styles) {
...
    let left, top;
+++              , width;
...
    if (angle === 0) {
      left = tx[4];
      top = tx[5] - fontAscent;
    } else {
      left = tx[4] + fontAscent * Math.sin(angle);
      top = tx[5] - fontAscent * Math.cos(angle);
    }
+++ width = geom.width * task._viewport.transform[0];

    textDiv.style.left = `${left}px`;
    textDiv.style.top = `${top}px`;
+++ textDiv.style.width = `${width}px`;

- new nodeView.js

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
const chp = require('child_process');
// datauri is loaded from the global npm folder under %APPDATA% (installed with `npm i -g datauri`).
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));

// File name of the sample PDF that the stock viewer opens by default.
// It is the placeholder that gets replaced by the data URI of the requested PDF.
const DEFAULT_PDF_NAME = 'compressed.tracemonkey-pldi-09.pdf';

/**
 * Builds a patched copy of the viewer source that opens `dataUri` instead of the
 * bundled sample PDF and additionally lists "file://" in HOSTED_VIEWER_ORIGINS.
 *
 * @param {string} viewerSource - text of the viewer script.
 * @param {string} dataUri - data URI of the PDF to embed.
 * @returns {string} the patched viewer source.
 * @throws {Error} when the placeholder name or the origins list cannot be located.
 */
function embedPdf(viewerSource, dataUri) {
    const nameStart = viewerSource.indexOf(DEFAULT_PDF_NAME);
    if (nameStart < 0) {
        throw new Error('"' + DEFAULT_PDF_NAME + '" not found in viewer source');
    }
    const nameEnd = nameStart + DEFAULT_PDF_NAME.length;
    const originsDecl = viewerSource.indexOf('HOSTED_VIEWER_ORIGINS');
    // Position of the "]" that closes the first array following the identifier.
    const originsEnd = originsDecl < 0 ? -1 : viewerSource.indexOf(']', originsDecl);
    if (originsEnd < nameEnd) {
        throw new Error('HOSTED_VIEWER_ORIGINS list not found after the default PDF name in viewer source');
    }
    return viewerSource.slice(0, nameStart) + dataUri +
        viewerSource.slice(nameEnd, originsEnd) + ', "file://"' +
        viewerSource.slice(originsEnd);
}

// The callback form of datauri is used, as in the other scripts of this project;
// the content is only available asynchronously.
datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer');
    // NOTE(review): the template is read from "viewer" (no extension) while the result
    // is written to "viewer.js" - confirm that an extension-less template file exists.
    const wp = fs.readFileSync(viewerJSpath, 'utf-8');
    fs.writeFileSync(viewerJSpath + '.js', embedPdf(wp, content), 'utf-8');
    // Executing the path lets the shell open it with whatever is associated with ".ff".
    const c = path.join(__dirname, 'viewer.ff');
    chp.execSync(c);
});

- new openFF.bat: start node nodeView.js %1

- new pdf2sortedMergedTexts.js

const fs = require('fs');
const path = require('path');
const pdf = require('process').argv[2];
// Both modules are loaded from the global npm folder under %APPDATA%.
const datauri = require(path.join(process.env.APPDATA, 'npm/node_modules', 'datauri'));
const puppeteer = require(path.join(process.env.APPDATA, 'npm/node_modules', 'puppeteer'));

// File name of the sample PDF that the stock viewer opens by default.
// It is the placeholder that gets replaced by the data URI of the requested PDF.
const DEFAULT_PDF_NAME = 'compressed.tracemonkey-pldi-09.pdf';

/**
 * Builds a patched copy of the viewer source that opens `dataUri` instead of the
 * bundled sample PDF and additionally lists "file://" in HOSTED_VIEWER_ORIGINS.
 *
 * @param {string} viewerSource - text of viewerSrc.js.
 * @param {string} dataUri - data URI of the PDF to embed.
 * @returns {string} the patched viewer source.
 * @throws {Error} when the placeholder name or the origins list cannot be located.
 */
function embedPdf(viewerSource, dataUri) {
    const nameStart = viewerSource.indexOf(DEFAULT_PDF_NAME);
    if (nameStart < 0) {
        throw new Error('"' + DEFAULT_PDF_NAME + '" not found in viewer source');
    }
    const nameEnd = nameStart + DEFAULT_PDF_NAME.length;
    const originsDecl = viewerSource.indexOf('HOSTED_VIEWER_ORIGINS');
    // Position of the "]" that closes the first array following the identifier.
    const originsEnd = originsDecl < 0 ? -1 : viewerSource.indexOf(']', originsDecl);
    if (originsEnd < nameEnd) {
        throw new Error('HOSTED_VIEWER_ORIGINS list not found after the default PDF name in viewer source');
    }
    return viewerSource.slice(0, nameStart) + dataUri +
        viewerSource.slice(nameEnd, originsEnd) + ', "file://"' +
        viewerSource.slice(originsEnd);
}

/**
 * Opens viewer.html in a puppeteer-controlled browser and writes the element
 * lists that the page passes to window.reader into PDFtexts.txt.
 *
 * @returns {Promise<void>} resolves once navigation has finished.
 */
async function extractTexts() {
    const browser = await puppeteer.launch({
        // headless: false
    });
    try {
        const pages = await browser.pages();
        const page = pages[0];
        // Register the callback BEFORE navigating, and wait for it, so the page
        // cannot call window.reader while it is still undefined.
        await page.exposeFunction('reader', (elLists) => {
            fs.writeFileSync(path.join(__dirname, 'PDFtexts.txt'), JSON.stringify(elLists, null, 4));
            setTimeout(() => { browser.close(); }, 100);
        });
        const c = path.join(__dirname, 'viewer.html');
        await page.goto('file:///' + c);
    } catch (error) {
        // Do not leave a stray browser process behind when setup or navigation fails.
        await browser.close();
        throw error;
    }
}

datauri(pdf, (err, content) => {
    if (err) {
        throw err;
    }
    const viewerJSpath = path.join(__dirname, './viewer');
    // The untouched template is read from viewerSrc.js and the patched copy is
    // written to viewer.js, so every run starts from a clean template.
    const template = fs.readFileSync(viewerJSpath + 'Src.js', 'utf-8');
    fs.writeFileSync(viewerJSpath + '.js', embedPdf(template, content), 'utf-8');
    extractTexts().catch((error) => {
        console.error(error);
        process.exitCode = 1;
    });
});

- changed viewer.js -> viewerSrc.js

function webViewerPageRendered({
... +++
  if (pageNumber < PDFViewerApplication.pagesCount) {
    arguments[0].source.eventBus.dispatch("pagenumberchanged", {
      value: pageNumber + 1
    });
  }
}
...
class BaseViewer {
  constructor(options) {
+++ this.pageNo = [];

...
  _setCurrentPageNumber(val, resetCurrentPageView = false) {
...
+++ if (this.pageNo.indexOf(val) < 0) {
+++   this.pageNo.push(val);
+++   console.log(this.pageNo);
+++ }
+++ if (this.pagesCount - 1 <= this.pageNo.length) {
+++   window.reader(elLists);
+++ }

    this._currentPageNumber = val;

  render(timeout = 0) {
...
    this.textLayerRenderTask.promise.then(() => {
      this.textLayerDiv.appendChild(textLayerFrag);
+++   this.reorder(this.textLayerDiv);

... new
  reorder(_src) {
    const src = _src.children;
    let els = [];
    const elDest = [];
    for (let j = 0; j < src.length; j++) {
        const i = src[j];
        if (i.className === 'endOfContent') continue;
        els.push({ x: parseFloat(i.style.left), y: parseFloat(i.style.top), w: parseFloat(i.style.width), h: i.offsetHeight, text: i.innerText, ff: i.style.fontFamily, fs: i.style.fontSize, cssText: i.style.cssText });
    }
    els.sort((a, b) => {
      if (Math.abs(a.y - b.y) <= 1) {
          if (Math.abs(a.x - b.x) <= 1) return 0;
          else return a.x - b.x;
      } else return a.y - b.y;
    });
    let elMin = els[0];
    for (let i = 1; i < els.length; i++) {
         if (elMin.x + elMin.w + 1 >= els[i].x &&
          Math.abs(elMin.y - els[i].y) < 1 &&
          elMin.h === els[i].h &&
          elMin.ff === els[i].ff &&
          elMin.fs === els[i].fs) {
            elMin.text += els[i].text;
            elMin.w = els[i].x + els[i].w - elMin.x;
            if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
            continue;
        }
        if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
        elMin = els[i];
    }
    if (elDest[elDest.length - 1] !== elMin) elDest.push(elMin);
    els = _src;
    while (els.lastChild) els.removeChild(els.lastChild);
    const elList = [];
    if (window.elLists === undefined) window.elLists = {};
    const uqIdx = { x: [], y: [] };
    for (let i = 0; i < elDest.length; i++) {
        const o = document.createElement('DIV');
        o.innerHTML = elDest[i].text;
        o.setAttribute('style', elDest[i].cssText + 'width:' + elDest[i].w + 'px;position:absolute;');
        els.appendChild(o);
        elList.push([elDest[i].x, elDest[i].x + elDest[i].w, o, elDest[i].y, elDest[i].y + elDest[i].h, elDest[i].text]);
        if (uqIdx.x.indexOf(elDest[i].x) < 0) uqIdx.x.push(elDest[i].x);
        if (uqIdx.y.indexOf(elDest[i].y) < 0) uqIdx.y.push(elDest[i].y);
    }
    elLists[_src.parentElement.getAttribute("data-page-Number")] = Object.assign({}, elDest);
  }

- changed viewer.css

+++ input{padding:0px;border:1px solid #e0e0e0;}input:focus{background-color:red;}
发布评论

评论列表(0)

  1. 暂无评论