最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

javascript - nodejs web scraper for password protected website - Stack Overflow

programmeradmin0浏览0评论

I am trying to scrape a website using nodejs and it works perfectly on sites that do not require any authentication. But whenever I try to scrape a site with a form that requires a username and password, I only get the HTML from the authentication page (that is, if you were to click 'view page source' on the authentication page itself, that is the HTML I get). I am able to get the desired HTML using curl:

curl -d "username=myuser&password=mypw&submit=Login" URL

Here is my code...

var express = require('express');
var fs = require('fs'); //access to file system
var request = require('request');
var cheerio = require('cheerio');
var app     = express();

/**
 * GET /scrape - downloads the target page, extracts the text of every
 * `.span8 b` element and writes the result to output.json.
 *
 * The HTTP response is always the fixed string 'Check your console!';
 * the outcome is reported on the server console only.
 */
app.get('/scrape', function(req, res){
    // Declared locally: a bare `url = ...` assignment creates an implicit global.
    var url = 'myURL';

    request(url, function(error, response, html){
        // Initialised before the error check so that a failed request still
        // serialises a well-formed (empty) record instead of `undefined`.
        var json = { title : "", release : "", rating : ""};

        if(error){
            console.log("Error occurred: " + error);
        }
        else{
            // cheerio gives jQuery-style access to the returned html
            var $ = cheerio.load(html);

            // Each match overwrites the previous one, so the last matching
            // element wins. `rating` is not populated by this selector.
            $('.span8 b').each(function(){
                var data = $(this);
                json.title = data.first().text();
                json.release = data.text();
            });
        }

        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
            // Report a failed write instead of unconditionally claiming success.
            if(err){
                console.log("Error writing output.json: " + err);
                return;
            }
            console.log('File successfully written! - Check your project directory for the output.json file');
        });

        res.send('Check your console!');
    });
});

// Start the HTTP server, announce the port, then expose the app object
// through both `module.exports` and the local `exports` binding.
var listenPort = '8081';
app.listen(listenPort);
console.log('Magic happens on port 8081');
module.exports = app;
exports = module.exports;

I have tried the following...

// NOTE(review): this attempt does not parse as JavaScript - `key: value`
// pairs are not valid call arguments, and require() accepts only a module id.
var request = require('request',
    username:'myuser',
    password:'mypw');

This just returns the authentication page's HTML

// NOTE(review): `myuser`, `mypw`, `Login` and `myURL` are bare identifiers here,
// not string literals; no `method` option is set; and the closing `)` of the
// call is not shown in this excerpt.
request({form: {username:myuser, password:mypw, submit:Login}, url: myURL}, function(error, response, html){
    ...
    ...
    ...
}

This also just returns the authentication page's HTML

So my question is how do I achieve this using nodejs?

I am trying to scrape a website using nodejs and it works perfectly on sites that do not require any authentication. But whenever I try to scrape a site with a form that requires a username and password, I only get the HTML from the authentication page (that is, if you were to click 'view page source' on the authentication page itself, that is the HTML I get). I am able to get the desired HTML using curl:

curl -d "username=myuser&password=mypw&submit=Login" URL

Here is my code...

var express = require('express');
var fs = require('fs'); //access to file system
var request = require('request');
var cheerio = require('cheerio');
var app     = express();

/**
 * GET /scrape - downloads the target page, extracts the text of every
 * `.span8 b` element and writes the result to output.json.
 *
 * The HTTP response is always the fixed string 'Check your console!';
 * the outcome is reported on the server console only.
 */
app.get('/scrape', function(req, res){
    // Declared locally: a bare `url = ...` assignment creates an implicit global.
    var url = 'myURL';

    request(url, function(error, response, html){
        // Initialised before the error check so that a failed request still
        // serialises a well-formed (empty) record instead of `undefined`.
        var json = { title : "", release : "", rating : ""};

        if(error){
            console.log("Error occurred: " + error);
        }
        else{
            // cheerio gives jQuery-style access to the returned html
            var $ = cheerio.load(html);

            // Each match overwrites the previous one, so the last matching
            // element wins. `rating` is not populated by this selector.
            $('.span8 b').each(function(){
                var data = $(this);
                json.title = data.first().text();
                json.release = data.text();
            });
        }

        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
            // Report a failed write instead of unconditionally claiming success.
            if(err){
                console.log("Error writing output.json: " + err);
                return;
            }
            console.log('File successfully written! - Check your project directory for the output.json file');
        });

        res.send('Check your console!');
    });
});

// Start the HTTP server, announce the port, then expose the app object
// through both `module.exports` and the local `exports` binding.
var listenPort = '8081';
app.listen(listenPort);
console.log('Magic happens on port 8081');
module.exports = app;
exports = module.exports;

I have tried the following...

// NOTE(review): this attempt does not parse as JavaScript - `key: value`
// pairs are not valid call arguments, and require() accepts only a module id.
var request = require('request',
    username:'myuser',
    password:'mypw');

This just returns the authentication page's HTML

// NOTE(review): `myuser`, `mypw`, `Login` and `myURL` are bare identifiers here,
// not string literals; no `method` option is set; and the closing `)` of the
// call is not shown in this excerpt.
request({form: {username:myuser, password:mypw, submit:Login}, url: myURL}, function(error, response, html){
    ...
    ...
    ...
}

This also just returns the authentication page's HTML

So my question is how do I achieve this using nodejs?

Share Improve this question edited Jul 23, 2024 at 10:39 VLAZ 29.2k9 gold badges63 silver badges85 bronze badges asked Dec 23, 2014 at 14:03 gthb7gthb7 872 silver badges5 bronze badges
Add a comment  | 

1 Answer 1

Reset to default 3

You shouldn't use .get but .post, and put the POST parameters (username and password) in your call:

// Submit the login form as a POST with a URL-encoded body (the same payload
// as the curl command in the question), then parse the returned page.
request.post({
  headers: {'content-type' : 'application/x-www-form-urlencoded'},
  url:     url,
  body:    "username=myuser&password=mypw&submit=Login"
}, function(error, response, html){
    // When the request itself fails there is no page to parse.
    if (error) {
        console.log("Error occurred: " + error);
        return;
    }
    //do your parsing...
    var $ = cheerio.load(html)
});
发布评论

评论列表(0)

  1. 暂无评论