For my Node.js app I need to get the first page of Google search results, but from the google.com domain, because I need the
"People also search for"
knowledge graph info, which only shows up on google.com.
I figured I can use the request
and cheerio
modules to scrape content from Google's search results page, but when I try to access the URL I need, i.e. https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google
Google automatically redirects me to the .de
domain (as I'm based in Germany).
I tried first loading the http://www.google.com/ncr URL, which automatically switches off the country-specific redirect in browsers, but it didn't work...
Does anybody know what I could do differently to make it work?
Here's my code... Thank you!
var request = require("request");
var cheerio = require("cheerio");
/**
 * Serialize parsed cookie records into a single "Cookie" header value.
 *
 * Only the `key` and `value` properties of each record are emitted, as
 * "key=value"; any other properties on a record are ignored. Pairs are
 * separated by "; ".
 *
 * @param {Array<Object>} dataCookie - Records carrying `key` and `value`.
 * @returns {string} The joined header value, or "" when there are no records.
 */
function dataCookieToString(dataCookie) {
  var pairs = [];
  var index = 0;
  while (index < dataCookie.length) {
    var entry = dataCookie[index];
    pairs.push(entry.key + "=" + entry.value);
    index += 1;
  }
  return pairs.join("; ");
}
/**
 * Parse cookie header text into an array of cookie records.
 *
 * The input is coerced with toString() and cut into individual cookies at
 * every comma that is NOT followed by a space. A comma followed by a space is
 * left alone, so a date such as "expires=Wed, 09 Jun ..." stays in one piece.
 *
 * Each cookie is then split on "; ". The first segment supplies the record's
 * `key` and `value`; every later "name=value" segment becomes a property
 * `name` on the same record. A segment without "=" yields `undefined` as its
 * value.
 *
 * Segments are divided at the FIRST "=" only, so values that themselves
 * contain "=" (for example base64 padding) are preserved in full instead of
 * being truncated.
 *
 * @param {*} cookie - Cookie header text, or any value whose toString() yields it.
 * @returns {Array<Object>} One record per cookie, in input order.
 */
function mkdataCookie(cookie) {
  // Divide "name=value" at the first "=" so the value may contain "=" itself.
  function splitPair(segment) {
    var eq = segment.indexOf("=");
    if (eq === -1) {
      return [segment, undefined];
    }
    return [segment.slice(0, eq), segment.slice(eq + 1)];
  }

  // ",[12]," is a temporary marker inserted at cookie boundaries (commas not
  // followed by a space) so the string can be split there.
  var rawCookies = cookie.toString().replace(/,([^ ])/g, ",[12],$1").split(",[12],");
  var parsed = [];
  for (var x = 0; x < rawCookies.length; x++) {
    var segments = rawCookies[x].split("; ");
    var pair = splitPair(segments[0]);
    var record = {
      key: pair[0],
      value: pair[1]
    };
    // Remaining segments are attributes (path, expires, ...).
    for (var i = 1; i < segments.length; i++) {
      pair = splitPair(segments[i]);
      record[pair[0]] = pair[1];
    }
    parsed.push(record);
  }
  return parsed;
}
// Cookie sent with the first request: a single MC_STORE_ID record whose
// "expires" attribute is set roughly one day (86409000 ms) ahead. Only the
// key=value pair reaches the header, because dataCookieToString() drops
// every other attribute.
var dataCookie = mkdataCookie('MC_STORE_ID=66860; expires=' + new Date(new Date().getTime() + 86409000));

// Step 1: load the /ncr URL, which is meant to switch off the
// country-specific redirect before the search is issued.
request({
  uri: "https://www.google.com/ncr",
  headers: {
    'User-Agent': 'Mozilla/5.0',
    "Cookie": dataCookieToString(dataCookie)
  }
}, function(error, response, body) {
  if (error) {
    // The search is still attempted so the failure mode stays visible.
    console.error("Request to /ncr failed:", error);
  }
  // NOTE(review): no cookie jar is configured and no Cookie header is set
  // below, so cookies returned by the /ncr response are presumably not sent
  // with the search request — confirm against the request module's `jar` option.
  // Step 2: fetch the search results page and print the matching elements.
  request({
    uri: "https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google",
    headers: {
      'User-Agent': 'Mozilla/5.0'
    }
  }, function(searchError, searchResponse, searchBody) {
    if (searchError) {
      // Without a body there is nothing to parse.
      console.error("Search request failed:", searchError);
      return;
    }
    console.log(searchBody);
    var $ = cheerio.load(searchBody);
    $(".kno-fb-ctx").each(function() {
      var link = $(this);
      var text = link.text();
      console.log(text);
    });
  });
});
For my Node.js app I need to get the first page of Google search results, but from the google.com
domain, because I need the "People also search for"
knowledge graph info, which only shows up on google.com.
I figured I can use the request
and cheerio
modules to scrape content from Google's search results page, but when I try to access the URL I need, i.e. https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google
Google automatically redirects me to the .de
domain (as I'm based in Germany).
I tried first loading the http://www.google.com/ncr
URL, which automatically switches off the country-specific redirect in browsers, but it didn't work...
Does anybody know what I could do differently to make it work?
Here's my code... Thank you!
var request = require("request");
var cheerio = require("cheerio");
/**
 * Build a "Cookie" header value from parsed cookie records.
 *
 * Each record contributes "key=value", taken from its `key` and `value`
 * properties; other properties are not emitted. Entries are joined with "; ".
 *
 * @param {Array<Object>} dataCookie - Records carrying `key` and `value`.
 * @returns {string} The header value; "" for an empty list.
 */
function dataCookieToString(dataCookie) {
  var serialized = "";
  var separator = "";
  for (var i = 0; i < dataCookie.length; i += 1) {
    var cookie = dataCookie[i];
    serialized += separator + cookie.key + "=" + cookie.value;
    // Every entry after the first is preceded by the separator.
    separator = "; ";
  }
  return serialized;
}
/**
 * Parse cookie header text into an array of cookie records.
 *
 * The input is coerced with toString() and cut into individual cookies at
 * every comma that is NOT followed by a space. A comma followed by a space is
 * left alone, so a date such as "expires=Wed, 09 Jun ..." stays in one piece.
 *
 * Each cookie is then split on "; ". The first segment supplies the record's
 * `key` and `value`; every later "name=value" segment becomes a property
 * `name` on the same record. A segment without "=" yields `undefined` as its
 * value.
 *
 * Segments are divided at the FIRST "=" only, so values that themselves
 * contain "=" (for example base64 padding) are preserved in full instead of
 * being truncated.
 *
 * @param {*} cookie - Cookie header text, or any value whose toString() yields it.
 * @returns {Array<Object>} One record per cookie, in input order.
 */
function mkdataCookie(cookie) {
  // Divide "name=value" at the first "=" so the value may contain "=" itself.
  function splitPair(segment) {
    var eq = segment.indexOf("=");
    if (eq === -1) {
      return [segment, undefined];
    }
    return [segment.slice(0, eq), segment.slice(eq + 1)];
  }

  // ",[12]," is a temporary marker inserted at cookie boundaries (commas not
  // followed by a space) so the string can be split there.
  var rawCookies = cookie.toString().replace(/,([^ ])/g, ",[12],$1").split(",[12],");
  var parsed = [];
  for (var x = 0; x < rawCookies.length; x++) {
    var segments = rawCookies[x].split("; ");
    var pair = splitPair(segments[0]);
    var record = {
      key: pair[0],
      value: pair[1]
    };
    // Remaining segments are attributes (path, expires, ...).
    for (var i = 1; i < segments.length; i++) {
      pair = splitPair(segments[i]);
      record[pair[0]] = pair[1];
    }
    parsed.push(record);
  }
  return parsed;
}
// Cookie sent with the first request: a single MC_STORE_ID record whose
// "expires" attribute is set roughly one day (86409000 ms) ahead. Only the
// key=value pair reaches the header, because dataCookieToString() drops
// every other attribute.
var dataCookie = mkdataCookie('MC_STORE_ID=66860; expires=' + new Date(new Date().getTime() + 86409000));

// Step 1: load the /ncr URL, which is meant to switch off the
// country-specific redirect before the search is issued.
request({
  uri: "https://www.google.com/ncr",
  headers: {
    'User-Agent': 'Mozilla/5.0',
    "Cookie": dataCookieToString(dataCookie)
  }
}, function(error, response, body) {
  if (error) {
    // The search is still attempted so the failure mode stays visible.
    console.error("Request to /ncr failed:", error);
  }
  // NOTE(review): no cookie jar is configured and no Cookie header is set
  // below, so cookies returned by the /ncr response are presumably not sent
  // with the search request — confirm against the request module's `jar` option.
  // Step 2: fetch the search results page and print the matching elements.
  request({
    uri: "https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google",
    headers: {
      'User-Agent': 'Mozilla/5.0'
    }
  }, function(searchError, searchResponse, searchBody) {
    if (searchError) {
      // Without a body there is nothing to parse.
      console.error("Search request failed:", searchError);
      return;
    }
    console.log(searchBody);
    var $ = cheerio.load(searchBody);
    $(".kno-fb-ctx").each(function() {
      var link = $(this);
      var text = link.text();
      console.log(text);
    });
  });
});
Share
Improve this question
edited Jan 2, 2015 at 0:55
Aerodynamika
asked Jan 2, 2015 at 0:06
AerodynamikaAerodynamika
8,50318 gold badges93 silver badges150 bronze badges
4
- I've had success running scrapers for free on heroku, or you can just use heroku or another vm as a http proxy and run the scraper locally but go through the proxy. – bspates Commented Jan 2, 2015 at 1:18
- You realize that scraping Google like this is against their TOS, right? – John Mueller Commented Jan 3, 2015 at 13:01
- @JohnMueller No, I don't. I'm not scraping them. I'm just checking how it works in theory. No actual content was obtained. But can you point me to the TOS in question so I could see what happens? – Aerodynamika Commented Jan 3, 2015 at 13:33
- You can use our library that supports "People also search for" parsing: github.com/serpapi/google-search-results-nodejs – Hartator Commented Apr 9, 2018 at 21:34
1 Answer
Reset to default 3
Here's the solution: it's much easier than I thought.
However, I still have a problem: the body
I get does not contain the content that only shows up when JavaScript is enabled.
Does anybody know how to modify the code below so that the body also includes the JavaScript-generated content?
var request = require('request');
var cheerio = require("cheerio");
// Turn on the cookie jar for every request made through this wrapper, so
// cookies set by one response are sent with later requests.
request = request.defaults({jar: true});

// First request: the /ncr URL, sent with a desktop browser User-Agent.
var options = {
  url: 'http://www.google.com/ncr',
  headers: {
    'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; rv:1.9.2.16) Gecko/20110319 Firefox/3.6.16'
  }
};

request(options, function (ncrError) {
  if (ncrError) {
    // The search is still attempted so the failure mode stays visible.
    console.error('Request to /ncr failed:', ncrError);
  }
  // Second request: the actual search, issued only after /ncr has completed.
  request('https://www.google.com/search?gws_rd=ssl&site=&source=hp&q=google&oq=google', function (error, response, body) {
    if (error) {
      // Without a body there is nothing to parse.
      console.error('Search request failed:', error);
      return;
    }
    var $ = cheerio.load(body);
    // Print the text content of every <li> element in the returned HTML.
    $("li").each(function() {
      var link = $(this);
      var text = link.text();
      console.log(text);
    });
  });
});