最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

javascript - Scraping content from Google search results with request in Node.Js - Stack Overflow

programmeradmin1浏览0评论

For my Node.js app I need to get the first page of Google search results, but from the .com domain, because I need the "People also search for" knowledge graph info, which only shows up on Google.com.

I figured I can use the request and cheerio modules to scrape content from Google's search results page, but when I try to access the URL I need, i.e. https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google, Google automatically redirects me to the .de domain (as I'm based in Germany).

I tried first loading the http://www.google.com/ncr URL, which automatically switches off the country-specific redirect in browsers, but it didn't work...

Does anybody know what I could do differently to make it work?

Here's my code... Thank you!

var request = require("request");
var cheerio = require("cheerio");

/**
 * Serialize cookie records into a single `Cookie` request-header value.
 *
 * @param {Array<Object>} dataCookie - cookie records; only the `key` and
 *     `value` properties of each record are read.
 * @returns {string} the `key=value` pairs joined with "; " (an empty string
 *     when the list is empty).
 */
function dataCookieToString(dataCookie) {
    var pairs = [];
    for (var idx = 0; idx < dataCookie.length; idx++) {
        var entry = dataCookie[idx];
        pairs.push(entry.key + "=" + entry.value);
    }
    return pairs.join("; ");
}

/**
 * Parse one or more `Set-Cookie`-style strings into cookie records.
 *
 * Several cookies may be passed in one comma-separated string. A comma that
 * is followed by a space is NOT treated as a separator, so dates such as
 * "Thu, 01 Jan 2015 ..." inside an `expires` attribute stay intact.
 *
 * @param {*} cookie - value whose `toString()` yields the cookie text.
 * @returns {Array<Object>} one record per cookie: `key` and `value` hold the
 *     leading `name=value` pair; every further "; "-separated attribute is
 *     stored under its own name (`undefined` for attributes without "=").
 */
function mkdataCookie(cookie) {
    // Split on the FIRST "=" only: a value may itself contain "=" (for
    // example base64 padding), which a plain split("=") would truncate.
    function splitPair(pair) {
        var eq = pair.indexOf("=");
        if (eq === -1) {
            return [pair, undefined];
        }
        return [pair.slice(0, eq), pair.slice(eq + 1)];
    }

    // Tag separator commas (comma + non-space) with a marker, then split on it.
    var cookies = cookie.toString().replace(/,([^ ])/g, ",[12],$1").split(",[12],");
    for (var x = 0; x < cookies.length; x++) {
        var parts = cookies[x].split("; ");
        var first = splitPair(parts[0]);
        var record = {
            key: first[0],
            value: first[1]
        };
        for (var i = 1; i < parts.length; i++) {
            var attr = splitPair(parts[i]);
            record[attr[0]] = attr[1];
        }
        cookies[x] = record;
    }

    return cookies;
}

// Cookie sent with the first request; its `expires` is about one day ahead.
var dataCookie = mkdataCookie('MC_STORE_ID=66860; expires=' + new Date(new Date().getTime() + 86409000));

// One jar shared by both requests, so that cookies set by the /ncr response
// are sent again with the search request. Without a jar, `request` discards
// response cookies and the second call starts from scratch.
var cookieJar = request.jar();

// Step 1: load the "no country redirect" URL.
request({
    uri: "https://www.google.com/ncr",
    jar: cookieJar,
    headers: {
        'User-Agent': 'Mozilla/5.0',
        "Cookie": dataCookieToString(dataCookie)
    }
}, function(error) {
    if (error) {
        // Stop here: there is nothing to search with if the first call failed.
        console.error(error);
        return;
    }

    // Step 2: fetch the search results page with the same cookie jar.
    request({
        uri: "https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google",
        jar: cookieJar,
        headers: {
            'User-Agent': 'Mozilla/5.0'
        }
    }, function(searchError, searchResponse, searchBody) {
        if (searchError) {
            // `searchBody` is not usable on failure; do not hand it to cheerio.
            console.error(searchError);
            return;
        }

        console.log(searchBody);
        var $ = cheerio.load(searchBody);

        // Print the text of every element carrying the `kno-fb-ctx` class.
        $(".kno-fb-ctx").each(function() {
            console.log($(this).text());
        });
    });
});

For my Node.js app I need to get the first page of Google search results, but from the .com domain, because I need the "People also search for" knowledge graph info, which only shows up on Google.com.

I figured I can use the request and cheerio modules to scrape content from Google's search results page, but when I try to access the URL I need, i.e. https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google, Google automatically redirects me to the .de domain (as I'm based in Germany).

I tried first loading the http://www.google.com/ncr URL, which automatically switches off the country-specific redirect in browsers, but it didn't work...

Does anybody know what I could do differently to make it work?

Here's my code... Thank you!

var request = require("request");
var cheerio = require("cheerio");

/**
 * Build the value of a `Cookie` request header from cookie records.
 *
 * @param {Array<Object>} dataCookie - cookie records; each contributes its
 *     `key` and `value` properties, other properties are ignored.
 * @returns {string} "key=value" pairs separated by "; ", or "" for no records.
 */
function dataCookieToString(dataCookie) {
    var header = "";
    var separator = "";
    for (var n = 0; n < dataCookie.length; n++) {
        header = header + separator + dataCookie[n].key + "=" + dataCookie[n].value;
        // Every pair after the first is preceded by "; ".
        separator = "; ";
    }
    return header;
}

/**
 * Turn `Set-Cookie`-style text into an array of cookie records.
 *
 * The input may hold several cookies separated by commas. Only a comma that
 * is immediately followed by a non-space character separates cookies; a
 * comma followed by a space (as in "Thu, 01 Jan 2015 ...") is kept as data.
 *
 * @param {*} cookie - value whose `toString()` yields the cookie text.
 * @returns {Array<Object>} one record per cookie. `key`/`value` come from the
 *     leading `name=value` pair; each later "; "-separated attribute becomes
 *     a property named after it (`undefined` when the attribute has no "=").
 */
function mkdataCookie(cookie) {
    // Cut a "name=value" string at its first "=" so that "=" characters
    // inside the value survive (split("=") would drop everything after the
    // second "=").
    function cutAtFirstEquals(text) {
        var pos = text.indexOf("=");
        return pos === -1
            ? [text, undefined]
            : [text.slice(0, pos), text.slice(pos + 1)];
    }

    // Mark each separator comma, then split on the marker.
    var cookies = cookie.toString().replace(/,([^ ])/g, ",[12],$1").split(",[12],");
    for (var x = 0; x < cookies.length; x++) {
        var segments = cookies[x].split("; ");
        var nameValue = cutAtFirstEquals(segments[0]);
        var record = {
            key: nameValue[0],
            value: nameValue[1]
        };
        for (var i = 1; i < segments.length; i++) {
            var attribute = cutAtFirstEquals(segments[i]);
            record[attribute[0]] = attribute[1];
        }
        cookies[x] = record;
    }

    return cookies;
}

// Cookie sent with the first request; its `expires` is about one day ahead.
var dataCookie = mkdataCookie('MC_STORE_ID=66860; expires=' + new Date(new Date().getTime() + 86409000));

// One jar shared by both requests, so that cookies set by the /ncr response
// are sent again with the search request. Without a jar, `request` discards
// response cookies and the second call starts from scratch.
var cookieJar = request.jar();

// Step 1: load the "no country redirect" URL.
request({
    uri: "https://www.google.com/ncr",
    jar: cookieJar,
    headers: {
        'User-Agent': 'Mozilla/5.0',
        "Cookie": dataCookieToString(dataCookie)
    }
}, function(error) {
    if (error) {
        // Stop here: there is nothing to search with if the first call failed.
        console.error(error);
        return;
    }

    // Step 2: fetch the search results page with the same cookie jar.
    request({
        uri: "https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google",
        jar: cookieJar,
        headers: {
            'User-Agent': 'Mozilla/5.0'
        }
    }, function(searchError, searchResponse, searchBody) {
        if (searchError) {
            // `searchBody` is not usable on failure; do not hand it to cheerio.
            console.error(searchError);
            return;
        }

        console.log(searchBody);
        var $ = cheerio.load(searchBody);

        // Print the text of every element carrying the `kno-fb-ctx` class.
        $(".kno-fb-ctx").each(function() {
            console.log($(this).text());
        });
    });
});
Share Improve this question edited Jan 2, 2015 at 0:55 Aerodynamika asked Jan 2, 2015 at 0:06 AerodynamikaAerodynamika 8,50318 gold badges93 silver badges150 bronze badges 4
  • I've had success running scrapers for free on heroku, or you can just use heroku or another vm as a http proxy and run the scraper locally but go through the proxy. – bspates Commented Jan 2, 2015 at 1:18
  • You realize that scraping Google like this is against their TOS, right? – John Mueller Commented Jan 3, 2015 at 13:01
  • @JohnMueller No, I don't. I'm not scraping them. I'm just checking how it works in theory. No actual content was obtained. But can you point me to the TOS in question so I could see what happens? – Aerodynamika Commented Jan 3, 2015 at 13:33
  • You can use our library that supports "People also search for" parsing: github.com/serpapi/google-search-results-nodejs – Hartator Commented Apr 9, 2018 at 21:34
Add a comment  | 

1 Answer 1

Reset to default 3

Here's the solution: it's much easier than I thought.

However, I still have a problem: the body I get does not contain the content that only shows up when JavaScript is enabled.

Does anybody know how to modify the code below so that the body also includes the JavaScript-enabled content?

var request = require('request');
var cheerio = require("cheerio");

// Turn on the cookie jar for every call made through this wrapper, so the
// cookies set by the /ncr response are sent with the search request.
request = request.defaults({jar: true});

var options = {
    url: 'http://www.google.com/ncr',
    headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; rv:1.9.2.16) Gecko/20110319 Firefox/3.6.16'
    }
};

// Step 1: load the "no country redirect" URL to collect its cookies.
request(options, function (ncrError) {
    if (ncrError) {
        // Without the /ncr cookies the search would be redirected again.
        console.error(ncrError);
        return;
    }

    // Step 2: fetch the search results page; cookies come from the jar.
    request('https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google', function (error, response, body) {
        if (error) {
            // `body` is not usable on failure; do not hand it to cheerio.
            console.error(error);
            return;
        }

        var $ = cheerio.load(body);

        // Print the text of every <li> element in the returned HTML.
        $("li").each(function() {
            console.log($(this).text());
        });
    });
});
发布评论

评论列表(0)

  1. 暂无评论