
node.js - Getting all images from a webpage and saving them to disk programmatically (NodeJS & JavaScript) - Stack Overflow


I need to get a lot of images from a few websites and download them to my disk so that I can use them (I will upload them to an Azure blob and then save the link in my DB).

GETTING THE IMAGES

I know how to get the images from the HTML with JS. For example, for one of the sites I would make a for-loop and do:

document.getElementsByClassName('person')[i].querySelector('div').querySelector('img').getAttribute('src')

And there I would have the links to all the images.
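
For example, something like this (just a sketch of what I mean, assuming the same 'person' markup as above) would collect the src and alt of every image into one array:

// Sketch: collect { src, alt } for every image inside a .person element
var people = document.getElementsByClassName('person');
var images = [];
for (var i = 0; i < people.length; i++) {
  var img = people[i].querySelector('div').querySelector('img');
  images.push({
    src: img.getAttribute('src'),
    alt: img.getAttribute('alt')
  });
}
console.log(images); // e.g. [{ src: '...', alt: '...' }, ...]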

SAVING THE IMAGES

I also saw that I can save the files to disk using node and the fs module, by doing:

var https = require('https');
var fs = require('fs');

function saveImageToDisk(url, localPath) {
  var file = fs.createWriteStream(localPath);
  https.get(url, function(response) {
    // stream the HTTP response straight into the file
    response.pipe(file);
  });
}
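
If I need to know when the file has actually finished writing (for example before uploading it to the blob), I suppose I could listen for the write stream's finish event, something like this (the onDone callback is not part of the snippet above, just a guess):

function saveImageToDisk(url, localPath, onDone) {
  var file = fs.createWriteStream(localPath);
  https.get(url, function(response) {
    response.pipe(file);
    // 'finish' fires once all data has been flushed to the file
    file.on('finish', onDone);
  });
}

// Usage sketch with placeholder values
saveImageToDisk('https://example.com/photo.jpg', './photo.jpg', function() {
  console.log('image saved');
});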

HOW TO PUT IT ALL TOGETHER

This is where I am stuck: I don't know exactly how to connect the two parts (the script and the Node.js code). I want to get the image and also the image name (the alt tag in this case), and then use them in Node to upload the image to a blob and put the name and the image's blob URL in my DB.

I thought I could download the HTML page and then put the JS script at the bottom of the body, but then I don't know how to pass the URL to the Node.js code.

How can I do this?

I am not very used to working with scripts; I have mostly used Node without them, and I get a bit confused by their interactions and how to connect JS scripts to my code.

Also, is this the best way to go about this, or is there a simpler/better way I am not seeing?

asked Jan 30, 2019 at 14:31 by Jack; edited Feb 28, 2020 at 4:36 by Cœur
  • Take a look at Google's Puppeteer; with it you can access and take data from a website via Node: github.com/GoogleChrome/puppeteer – Gabriel Carneiro, Jan 30, 2019 at 14:37
  • Use Ajax to send the URL array from the client to the Node server – ellipsis, Jan 30, 2019 at 14:41

4 Answers


This feels like you should use a crawler. The following code should work (using the npm module crawler):

const Crawler = require("crawler")

const c = new Crawler({
    callback: function(error, res, done) {
        if (error) {
            console.log({error})
        } else {
            const images = res.$('.person div img')
            images.each(index => {
                // here you can save the file or save them in an array to download them later
                console.log({
                    src: images[index].attribs.src,
                    alt: images[index].attribs.alt,
                })
            })
        }
        // the crawler module expects done() to be called when the callback is finished
        done()
    }
})

c.queue('https://www.yoursite.com')
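
Where the comment above says you can save the file, a minimal follow-up sketch (reusing the saveImageToDisk idea from the question; the ./images folder and the file-naming scheme are assumptions, not part of the answer) could look like this:

const https = require('https')
const fs = require('fs')

// Stream one image URL to a local file
function saveImageToDisk(url, localPath) {
    const file = fs.createWriteStream(localPath)
    https.get(url, (response) => {
        response.pipe(file)
    })
}

// Inside the crawler callback, instead of (or next to) the console.log:
// saveImageToDisk(images[index].attribs.src, `./images/${index}.jpg`)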

You need a bridge between the Web API (for DOM parsing etc.) and the Node.js API, for example some headless browser management tool for Node.js. Say, you can use puppeteer with this script:

'use strict';

const puppeteer = require('puppeteer');
const https = require('https');
const fs = require('fs');

(async function main() {
  try {
    const browser = await puppeteer.launch();
    const [page] = await browser.pages();

    await page.goto('https://en.wikipedia.org/wiki/Image');

    const imgURLs = await page.evaluate(() =>
      Array.from(
        document.querySelectorAll('#mw-content-text img.thumbimage'),
        ({ src }) => src,
      )
    );
    console.log(imgURLs);
    await browser.close();

    imgURLs.forEach((imgURL, i) => {
      https.get(imgURL, (response) => {
        response.pipe(fs.createWriteStream(`${i}.${imgURL.slice(-3)}`));
      });
    });
  } catch (err) {
    console.error(err);
  }
})();

You can even download each image just once, reusing the pictures already downloaded by the browser. This script saves the same images, but with a single session of requests and without using the https Node.js module (which saves time, network traffic and server workload):

'use strict';

const puppeteer = require('puppeteer');
const fs = require('fs');

(async function main() {
  try {
    const browser = await puppeteer.launch();
    const [page] = await browser.pages();

    const allImgResponses = {};
    page.on('response', (response) => {
      if (response.request().resourceType() === 'image') {
        allImgResponses[response.url()] = response;
      }
    });

    await page.goto('https://en.wikipedia.org/wiki/Image');

    const selectedImgURLs = await page.evaluate(() =>
      Array.from(
        document.querySelectorAll('#mw-content-text img.thumbimage'),
        ({ src }) => src,
      )
    );
    console.log(selectedImgURLs);

    let i = 0;
    for (const imgURL of selectedImgURLs) {
      fs.writeFileSync(
        `${i++}.${imgURL.slice(-3)}`,
        await allImgResponses[imgURL].buffer(),
      );
    }

    await browser.close();
  } catch (err) {
    console.error(err);
  }
})();

I recommend you to use the dom-parser module. See here: https://www.npmjs.com/package/dom-parser

By doing so, you can download the whole HTML file with http.get() and parse it using the dom-parser. Then extract all the information you need from the HTML file. With the image URL, use your saveImageToDisk() function.
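
A rough sketch of that flow (assuming dom-parser's parseFromString / getElementsByTagName / getAttribute API and a placeholder page URL; error handling omitted):

const https = require('https');
const DomParser = require('dom-parser');

// Download the page HTML, parse it, and pull out every <img> src and alt
https.get('https://www.example.com/some-page', (response) => {
  let html = '';
  response.on('data', (chunk) => { html += chunk; });
  response.on('end', () => {
    const dom = new DomParser().parseFromString(html);
    dom.getElementsByTagName('img').forEach((img) => {
      const src = img.getAttribute('src');
      const alt = img.getAttribute('alt');
      console.log({ src, alt });
      // then call saveImageToDisk(src, someLocalPath) for each one
    });
  });
});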

Following your idea, you would have to add the JS script to the HTML file as you mentioned, but in addition you would have to use Ajax (XMLHttpRequest) to post the URLs to a Node.js server.
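
A minimal sketch of that bridge, assuming a Node server on port 3000 with an /images endpoint (the port, the path and the file naming are placeholders; in a real setup you would also need to handle CORS):

// --- Browser side: collect the image URLs and POST them to the server ---
var urls = Array.from(document.querySelectorAll('.person div img')).map(function (img) {
  return { src: img.getAttribute('src'), alt: img.getAttribute('alt') };
});

var xhr = new XMLHttpRequest();
xhr.open('POST', 'http://localhost:3000/images');
xhr.setRequestHeader('Content-Type', 'application/json');
xhr.send(JSON.stringify(urls));

// --- Node side: receive the JSON body and save each image ---
const http = require('http');

http.createServer((req, res) => {
  if (req.method === 'POST' && req.url === '/images') {
    let body = '';
    req.on('data', (chunk) => { body += chunk; });
    req.on('end', () => {
      const images = JSON.parse(body);
      images.forEach((img, i) => {
        // e.g. saveImageToDisk(img.src, `./images/${i}.jpg`)
        console.log(img.src, img.alt);
      });
      res.end('ok');
    });
  } else {
    res.end();
  }
}).listen(3000);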

You can use a Promise and, inside it, do the job of getting all the images and putting the image URLs in an array. Then, inside the then method, you can either iterate over the array and call saveImageToDisk each time, or send the array to the middle layer with a slight modification. The second option is better since it will make only one network call.

function getImages() {
  return new Promise((resolve, reject) => {
     // Array.from will create an array
     // map will return a new array with all the image url
    let k = Array.from(document.getElementsByClassName('person')[0].querySelector('div')
        .querySelectorAll('img'))
      .map((item) => {
        return item.getAttribute('src')
      })
    resolve(k)
  })
}
getImages().then((d) => {
  // it will work only after the promise is resolved
  console.log('****', d);
  d.forEach((item) => {
    // call the saveImageToDisk function for each URL
    // saveImageToDisk(item, someLocalPath)
  });
});

function saveImageToDisk(url, localPath) {
  var fullUrl = url;
  var file = fs.createWriteStream(localPath);
  var request = https.get(url, function(response) {
    response.pipe(file);
  });
}

<div class='person'>
  <div>
    <img src='https://www.fast-growing-trees.com/images/P/Leyland-Cypress-450-MAIN.jpg'>
    <img src='http://cdn.shopify.com/s/files/1/2473/3486/products/Cypress_Leyland_2_Horticopia_d1b5b63a-8bf7-4897-96fb-05320bf3d81b_grande.jpg?v=1532991076'>
    <img src='https://www.fast-growing-trees.com/images/P/Live-Oak-Tree-450w.jpg'>
    <img src='https://www.greatgardenplants.com/images/uploads/452_1262_popup.jpg'>
    <img src='https://shop.arborday.org/data/default/images/catalog/600/Turnkey/1/Leyland-Cypress_3-828.jpg'>
    <img src='https://images-na.ssl-images-amazon.com/images/I/51RZkKnrlSL._SX425_.jpg'>
    <img src='https://thumbs-prod.si-cdn.com/Z3JYiuJ96ReLq04NCT1B94sTd4E=/800x600/filters:no_upscale()/https://public-media.si-cdn.com/filer/06/9c/069cfb16-c46c-4742-85f0-3c7e45fa139d/mar2018_a05_talkingtrees.jpg'>
  </div>
</div>
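
For the last step from the question (uploading the saved image to an Azure blob and keeping its URL for the DB), a minimal sketch assuming the @azure/storage-blob package, a connection string in an environment variable and an existing container named 'images' (all of these names are placeholders):

const { BlobServiceClient } = require('@azure/storage-blob');

// Upload a local file to Azure Blob Storage and return the blob URL.
// The connection string, container name and blob name are placeholders.
async function uploadImageToBlob(localPath, blobName) {
  const blobServiceClient = BlobServiceClient.fromConnectionString(
    process.env.AZURE_STORAGE_CONNECTION_STRING
  );
  const containerClient = blobServiceClient.getContainerClient('images');
  const blockBlobClient = containerClient.getBlockBlobClient(blobName);

  await blockBlobClient.uploadFile(localPath);

  // this URL is what would go in the DB next to the image name (alt text)
  return blockBlobClient.url;
}

// Usage sketch
// uploadImageToBlob('./photo.jpg', 'photo.jpg').then((url) => console.log(url));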
