最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

web scraping - Avoid instant block by Chrome when using Puppeteer - Stack Overflow

programmeradmin0浏览0评论

I'm coding a Puppeteer script, which starts up an instance of Google Chrome for Testing. Even if this browser is not headless, and even if I control it manually without any automation, whenever I try to do a Google search, I get blocked as a bot immediately. Going directly to website URLs works; it's Google that's blocking me from doing any searches. This instant block has persisted for days. However, I'm not using any proxies, and I'm still allowed to do searches (without CAPTCHAs) with my IP on my normal Google Chrome browser.

Puppeteer is using the Stealth plugin, params to avoid bot detection, a fake user agent, etc. Yet, Chrome instantly detects me as a bot. How can I avoid this, and why is this happening? How is Chrome tracking/detecting me?

Here is my code to define the stealth settings:

const os = require("os");

/**
 * Applies user-agent, HTTP-header, history and viewport overrides to a page.
 *
 * The user agent is generated from the host platform, and the client-hint
 * headers are derived from that same user agent so the two never disagree.
 *
 * @param {object} page - Puppeteer page-like object. This function calls
 *   `setUserAgent`, `setExtraHTTPHeaders`, `evaluate`, `setViewport` and
 *   `setJavaScriptEnabled` on it.
 * @returns {Promise<void>} Resolves once every override has been applied.
 * @throws Rethrows (after logging) any error raised by the page calls. A
 *   failure while injecting the fake history is only logged, not rethrown.
 */
async function initializeStealthBrowser(page) {
  // Used when the generated user agent fails validation.
  const FALLBACK_USER_AGENT =
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36";

  // Major version advertised in sec-ch-ua if none can be parsed from the UA.
  const DEFAULT_CHROME_MAJOR = "134";

  /**
   * Builds a desktop Chrome user-agent string for the host operating system,
   * picking one of the known Chrome builds for that platform at random.
   *
   * @returns {string} The user-agent string.
   */
  function generateChromeUserAgent() {
    const CHROME_VERSIONS = {
      win: ["134.0.6998.35", "134.0.6998.36"],
      mac: ["134.0.6998.44", "134.0.6998.45"],
      linux: ["134.0.6998.35"],
    };
    // Operating-system token placed inside the parentheses of the UA string.
    const SYSTEM_TOKENS = {
      win: "Windows NT 10.0; Win64; x64",
      mac: "Macintosh; Intel Mac OS X 10_15_7",
      linux: "X11; Linux x86_64",
    };

    // Every host that is not Windows or macOS is treated as Linux.
    const hostPlatform = os.platform();
    let platform = "linux";
    if (hostPlatform === "win32") {
      platform = "win";
    } else if (hostPlatform === "darwin") {
      platform = "mac";
    }

    const candidates = CHROME_VERSIONS[platform];
    const version = candidates[Math.floor(Math.random() * candidates.length)];

    return `Mozilla/5.0 (${SYSTEM_TOKENS[platform]}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
  }

  /**
   * Builds the extra request headers for a given user agent. The sec-ch-ua
   * version and sec-ch-ua-platform value are read from the user-agent string
   * itself, so they stay consistent with it (including the fallback UA).
   *
   * @param {string} userAgent - User-agent string the headers must match.
   * @returns {Object<string, string>} Plain header-name to value map.
   */
  function generateRealisticHTTPHeaders(userAgent) {
    const versionMatch = /Chrome\/(\d+)\./.exec(userAgent);
    const majorVersion = versionMatch ? versionMatch[1] : DEFAULT_CHROME_MAJOR;

    let clientHintPlatform = "Linux";
    if (userAgent.includes("Windows")) {
      clientHintPlatform = "Windows";
    } else if (userAgent.includes("Macintosh")) {
      clientHintPlatform = "macOS";
    }

    // NOTE(review): Puppeteer documents that extra headers are sent with
    // every request the page initiates, so navigation-only values such as
    // "sec-fetch-dest: document" also go out on sub-resource requests --
    // confirm this is intended.
    const headers = {
      accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "accept-language": "en-US,en;q=0.9",
      "accept-encoding": "gzip, deflate, br",
      "cache-control": "max-age=0",
      "sec-ch-ua": `"Google Chrome";v="${majorVersion}", "Chromium";v="${majorVersion}", "Not?A_Brand";v="24"`,
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": `"${clientHintPlatform}"`,
      "sec-fetch-dest": "document",
      "sec-fetch-mode": "navigate",
      "sec-fetch-site": "none",
      "sec-fetch-user": "?1",
      "upgrade-insecure-requests": "1",
      "user-agent": userAgent,
    };
    return headers;
  }

  /**
   * Runs a script in the page that pushes 3-8 same-origin history entries and
   * wraps `history.pushState`, `replaceState`, `back` and `length` so they
   * track a private list of entries. Failures are logged and swallowed so
   * that initialization can continue.
   *
   * NOTE(review): this uses `page.evaluate`, which runs in the document that
   * is currently loaded; the overrides presumably do not survive a later
   * navigation -- confirm whether `evaluateOnNewDocument` was intended.
   *
   * @param {object} page - Puppeteer page-like object exposing `evaluate`.
   * @returns {Promise<void>}
   */
  async function injectFakeHistory(page) {
    try {
      await page.evaluate(() => {
        // Private list mirroring the entries we pretend to have visited.
        const fakeHistoryEntries = [];

        // All fake entries live on the current origin, because pushState
        // rejects cross-origin URLs.
        const currentOrigin = window.location.origin;
        const pathOptions = [
          "/search",
          "/news",
          "/maps",
          "/images",
          "/mail",
          "/drive",
          "/calendar",
          "/shopping",
        ];

        // Between 3 and 8 entries.
        const entryCount = Math.floor(Math.random() * 6) + 3;
        for (let i = 0; i < entryCount; i++) {
          const randomPath =
            pathOptions[Math.floor(Math.random() * pathOptions.length)];
          fakeHistoryEntries.push({
            url: `${currentOrigin}${randomPath}`,
            title: `Page ${randomPath.substring(1)}`,
            state: {},
          });
        }

        // Keep the native implementations so the wrappers can delegate.
        const originalPushState = window.history.pushState;
        const originalReplaceState = window.history.replaceState;
        const originalBack = window.history.back;

        // history.length reports the size of the private list.
        Object.defineProperty(window.history, "length", {
          get: function () {
            return fakeHistoryEntries.length;
          },
        });

        window.history.pushState = function (state, title, url) {
          try {
            originalPushState.call(window.history, state, title, url);
          } catch (e) {
            // Deliberate best effort: a rejected URL must not break the page.
          }

          // Record the entry even when the native call was rejected.
          fakeHistoryEntries.push({
            url: url || window.location.href,
            title: title,
            state: state,
          });
        };

        window.history.replaceState = function (state, title, url) {
          try {
            originalReplaceState.call(window.history, state, title, url);
          } catch (e) {
            // Deliberate best effort: a rejected URL must not break the page.
          }

          // Overwrite the most recent private entry, if there is one.
          if (fakeHistoryEntries.length > 0) {
            fakeHistoryEntries[fakeHistoryEntries.length - 1] = {
              url: url || window.location.href,
              title: title,
              state: state,
            };
          }
        };

        window.history.back = function () {
          originalBack.call(window.history);

          // Never drop the last remaining private entry.
          if (fakeHistoryEntries.length > 1) {
            fakeHistoryEntries.pop();
          }
        };

        // Seed the real session history with the generated entries.
        for (const entry of fakeHistoryEntries) {
          try {
            originalPushState.call(
              window.history,
              entry.state,
              entry.title,
              entry.url
            );
          } catch (e) {
            // Deliberate best effort: skip entries the browser rejects.
          }
        }

        console.log(
          `Successfully set up fake browser history with ${fakeHistoryEntries.length} entries`
        );
      });
    } catch (error) {
      console.warn("Error setting up fake history:", error);
      // Continue execution even if history injection fails
    }
  }

  try {
    let userAgent = generateChromeUserAgent();

    // Guard against a non-string or NUL-containing value before it reaches
    // the browser.
    if (typeof userAgent !== "string" || userAgent.includes("\u0000")) {
      console.warn("Generated an invalid user agent, falling back to default");
      userAgent = FALLBACK_USER_AGENT;
    }

    await page.setUserAgent(userAgent);
    // The headers helper is synchronous and returns a plain object, so the
    // page receives real header values rather than a pending Promise.
    await page.setExtraHTTPHeaders(generateRealisticHTTPHeaders(userAgent));
    await injectFakeHistory(page);
    await page.setViewport({
      width: 1920,
      height: 1080,
      deviceScaleFactor: 1,
    });
    await page.setJavaScriptEnabled(true);
  } catch (error) {
    console.error("Error initializing stealth browser:", error);
    throw error;
  }
}

module.exports = { initializeStealthBrowser };

And here's the code for the first initialization:

// Launch a visible (non-headless) browser via puppeteerExtra.launch, using the
// profile directory held in userDataDir, then open the first tab.
const browser = await puppeteerExtra.launch({
      headless: false,
      userDataDir: userDataDir,
      // NOTE(review): `referer` is documented as a navigation (page.goto)
      // option; it is presumably ignored when passed to launch() -- confirm.
      referer: "https://www.google",
      args: [
        "--disable-features=site-per-process",
        "--disable-advertisements",
        "--enable-javascript",
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-gpu",
        "--enable-webgl",
      ],
    });

    let page = await browser.newPage();

I'm coding a Puppeteer script, which starts up an instance of Google Chrome for Testing. Even if this browser is not headless, and even if I control it manually without any automation, whenever I try to do a Google search, I get blocked as a bot immediately. Going directly to website URLs works; it's Google that's blocking me from doing any searches. This instant block has persisted for days. However, I'm not using any proxies, and I'm still allowed to do searches (without CAPTCHAs) with my IP on my normal Google Chrome browser.

Puppeteer is using the Stealth plugin, params to avoid bot detection, a fake user agent, etc. Yet, Chrome instantly detects me as a bot. How can I avoid this, and why is this happening? How is Chrome tracking/detecting me?

Here is my code to define the stealth settings:

const os = require("os");

/**
 * Applies user-agent, HTTP-header, history and viewport overrides to a page.
 *
 * The user agent is generated from the host platform, and the client-hint
 * headers are derived from that same user agent so the two never disagree.
 *
 * @param {object} page - Puppeteer page-like object. This function calls
 *   `setUserAgent`, `setExtraHTTPHeaders`, `evaluate`, `setViewport` and
 *   `setJavaScriptEnabled` on it.
 * @returns {Promise<void>} Resolves once every override has been applied.
 * @throws Rethrows (after logging) any error raised by the page calls. A
 *   failure while injecting the fake history is only logged, not rethrown.
 */
async function initializeStealthBrowser(page) {
  // Used when the generated user agent fails validation.
  const FALLBACK_USER_AGENT =
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36";

  // Major version advertised in sec-ch-ua if none can be parsed from the UA.
  const DEFAULT_CHROME_MAJOR = "134";

  /**
   * Builds a desktop Chrome user-agent string for the host operating system,
   * picking one of the known Chrome builds for that platform at random.
   *
   * @returns {string} The user-agent string.
   */
  function generateChromeUserAgent() {
    const CHROME_VERSIONS = {
      win: ["134.0.6998.35", "134.0.6998.36"],
      mac: ["134.0.6998.44", "134.0.6998.45"],
      linux: ["134.0.6998.35"],
    };
    // Operating-system token placed inside the parentheses of the UA string.
    const SYSTEM_TOKENS = {
      win: "Windows NT 10.0; Win64; x64",
      mac: "Macintosh; Intel Mac OS X 10_15_7",
      linux: "X11; Linux x86_64",
    };

    // Every host that is not Windows or macOS is treated as Linux.
    const hostPlatform = os.platform();
    let platform = "linux";
    if (hostPlatform === "win32") {
      platform = "win";
    } else if (hostPlatform === "darwin") {
      platform = "mac";
    }

    const candidates = CHROME_VERSIONS[platform];
    const version = candidates[Math.floor(Math.random() * candidates.length)];

    return `Mozilla/5.0 (${SYSTEM_TOKENS[platform]}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
  }

  /**
   * Builds the extra request headers for a given user agent. The sec-ch-ua
   * version and sec-ch-ua-platform value are read from the user-agent string
   * itself, so they stay consistent with it (including the fallback UA).
   *
   * @param {string} userAgent - User-agent string the headers must match.
   * @returns {Object<string, string>} Plain header-name to value map.
   */
  function generateRealisticHTTPHeaders(userAgent) {
    const versionMatch = /Chrome\/(\d+)\./.exec(userAgent);
    const majorVersion = versionMatch ? versionMatch[1] : DEFAULT_CHROME_MAJOR;

    let clientHintPlatform = "Linux";
    if (userAgent.includes("Windows")) {
      clientHintPlatform = "Windows";
    } else if (userAgent.includes("Macintosh")) {
      clientHintPlatform = "macOS";
    }

    // NOTE(review): Puppeteer documents that extra headers are sent with
    // every request the page initiates, so navigation-only values such as
    // "sec-fetch-dest: document" also go out on sub-resource requests --
    // confirm this is intended.
    const headers = {
      accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "accept-language": "en-US,en;q=0.9",
      "accept-encoding": "gzip, deflate, br",
      "cache-control": "max-age=0",
      "sec-ch-ua": `"Google Chrome";v="${majorVersion}", "Chromium";v="${majorVersion}", "Not?A_Brand";v="24"`,
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": `"${clientHintPlatform}"`,
      "sec-fetch-dest": "document",
      "sec-fetch-mode": "navigate",
      "sec-fetch-site": "none",
      "sec-fetch-user": "?1",
      "upgrade-insecure-requests": "1",
      "user-agent": userAgent,
    };
    return headers;
  }

  /**
   * Runs a script in the page that pushes 3-8 same-origin history entries and
   * wraps `history.pushState`, `replaceState`, `back` and `length` so they
   * track a private list of entries. Failures are logged and swallowed so
   * that initialization can continue.
   *
   * NOTE(review): this uses `page.evaluate`, which runs in the document that
   * is currently loaded; the overrides presumably do not survive a later
   * navigation -- confirm whether `evaluateOnNewDocument` was intended.
   *
   * @param {object} page - Puppeteer page-like object exposing `evaluate`.
   * @returns {Promise<void>}
   */
  async function injectFakeHistory(page) {
    try {
      await page.evaluate(() => {
        // Private list mirroring the entries we pretend to have visited.
        const fakeHistoryEntries = [];

        // All fake entries live on the current origin, because pushState
        // rejects cross-origin URLs.
        const currentOrigin = window.location.origin;
        const pathOptions = [
          "/search",
          "/news",
          "/maps",
          "/images",
          "/mail",
          "/drive",
          "/calendar",
          "/shopping",
        ];

        // Between 3 and 8 entries.
        const entryCount = Math.floor(Math.random() * 6) + 3;
        for (let i = 0; i < entryCount; i++) {
          const randomPath =
            pathOptions[Math.floor(Math.random() * pathOptions.length)];
          fakeHistoryEntries.push({
            url: `${currentOrigin}${randomPath}`,
            title: `Page ${randomPath.substring(1)}`,
            state: {},
          });
        }

        // Keep the native implementations so the wrappers can delegate.
        const originalPushState = window.history.pushState;
        const originalReplaceState = window.history.replaceState;
        const originalBack = window.history.back;

        // history.length reports the size of the private list.
        Object.defineProperty(window.history, "length", {
          get: function () {
            return fakeHistoryEntries.length;
          },
        });

        window.history.pushState = function (state, title, url) {
          try {
            originalPushState.call(window.history, state, title, url);
          } catch (e) {
            // Deliberate best effort: a rejected URL must not break the page.
          }

          // Record the entry even when the native call was rejected.
          fakeHistoryEntries.push({
            url: url || window.location.href,
            title: title,
            state: state,
          });
        };

        window.history.replaceState = function (state, title, url) {
          try {
            originalReplaceState.call(window.history, state, title, url);
          } catch (e) {
            // Deliberate best effort: a rejected URL must not break the page.
          }

          // Overwrite the most recent private entry, if there is one.
          if (fakeHistoryEntries.length > 0) {
            fakeHistoryEntries[fakeHistoryEntries.length - 1] = {
              url: url || window.location.href,
              title: title,
              state: state,
            };
          }
        };

        window.history.back = function () {
          originalBack.call(window.history);

          // Never drop the last remaining private entry.
          if (fakeHistoryEntries.length > 1) {
            fakeHistoryEntries.pop();
          }
        };

        // Seed the real session history with the generated entries.
        for (const entry of fakeHistoryEntries) {
          try {
            originalPushState.call(
              window.history,
              entry.state,
              entry.title,
              entry.url
            );
          } catch (e) {
            // Deliberate best effort: skip entries the browser rejects.
          }
        }

        console.log(
          `Successfully set up fake browser history with ${fakeHistoryEntries.length} entries`
        );
      });
    } catch (error) {
      console.warn("Error setting up fake history:", error);
      // Continue execution even if history injection fails
    }
  }

  try {
    let userAgent = generateChromeUserAgent();

    // Guard against a non-string or NUL-containing value before it reaches
    // the browser.
    if (typeof userAgent !== "string" || userAgent.includes("\u0000")) {
      console.warn("Generated an invalid user agent, falling back to default");
      userAgent = FALLBACK_USER_AGENT;
    }

    await page.setUserAgent(userAgent);
    // The headers helper is synchronous and returns a plain object, so the
    // page receives real header values rather than a pending Promise.
    await page.setExtraHTTPHeaders(generateRealisticHTTPHeaders(userAgent));
    await injectFakeHistory(page);
    await page.setViewport({
      width: 1920,
      height: 1080,
      deviceScaleFactor: 1,
    });
    await page.setJavaScriptEnabled(true);
  } catch (error) {
    console.error("Error initializing stealth browser:", error);
    throw error;
  }
}

module.exports = { initializeStealthBrowser };

And here's the code for the first initialization:

// Command-line switches handed to the browser process at launch.
const chromeLaunchArgs = [
  "--disable-features=site-per-process",
  "--disable-advertisements",
  "--enable-javascript",
  "--disable-blink-features=AutomationControlled",
  "--no-sandbox",
  "--disable-gpu",
  "--enable-webgl",
];

// Start a visible (non-headless) browser on the profile in userDataDir.
// NOTE(review): `referer` is documented as a navigation (page.goto) option;
// it is presumably ignored when passed to launch() -- confirm.
const browser = await puppeteerExtra.launch({
  headless: false,
  userDataDir,
  referer: "https://www.google",
  args: chromeLaunchArgs,
});

    // First tab of the freshly launched browser.
    let page = await browser.newPage();
Share Improve this question edited Mar 13 at 3:54 ggorlen 58k8 gold badges114 silver badges157 bronze badges asked Mar 12 at 2:56 NateNate 135 bronze badges 3
  • This is rather broad. Different sites use different techniques for blocking bots, so I suggest sharing the exact site, as well as your minimal code and your high-level goal you want to achieve on the site. I've had no problem automating google searches--the bot blocking is quite non-aggressive as far as I've seen. There are also search APIs you can use. Thanks. – ggorlen Commented Mar 12 at 13:20
  • Hi, I edited the question to attach the code. The site I'm trying to bypass right now is google for searches. My goal is to not get captchas in the first place, but for google and other websites (including tripadvisor), I get a captcha immediately, even if I don't automate anything -- just by starting the browser. There are other sites I can use, but I want to fix this block so it doesn't happen anywhere. – Nate Commented Mar 12 at 21:58
  • Your IP is blacklisted at this point, so you would need to log in or solve a CAPTCHA. – pguardiario Commented Mar 16 at 23:27
Add a comment  | 

1 Answer 1

Reset to default 0

If it's for personal use then you could export your daily browser cookie & spoof it using puppeteer.

If it's for commercial use, then look into antidetect browsers. They offer pre-made cookies.

发布评论

评论列表(0)

  1. 暂无评论