2021年4月3日星期六

How can I disable JavaScript in Puppeteer?

I am trying to disable JavaScript using Puppeteer in a base class made to crawl websites; however, my script fails to do so: JavaScript is still enabled when I go to any website. Here is my code:

// https://stackoverflow.com/questions/39134419/run-tor-browser-with-selenium-webdriver
import puppeteer, { puppeteerErrors, Target, Browser } from "puppeteer";
import { readFileSync } from "fs"
import { helpers } from "./helpers";
import _ from "lodash"

/**
 * Base class for all crawlers.
 *
 * Constructing an instance launches a Puppeteer browser, registers a
 * "targetcreated" listener that configures every page it can obtain from a
 * new target, and then opens pages and navigates to a start URL.
 */
abstract class BaseCrawler {
  // Paths read from the environment. TOR_PATH and TOR_PROFILE_PATH fall back
  // to the empty string; TORRC_PATH stays undefined when the variable is unset.
  public static readonly TOR_PATH = process.env.TOR_PATH ?? "";
  public static readonly TOR_PROFILE_PATH = process.env.TOR_PROFILE_PATH ?? "";
  public static readonly TORRC_PATH = process.env.TORRC_PATH;
  // Passed as the `headless` launch option in the constructor.
  public static headless = false
  // Pending result of puppeteer.launch(); consumers await it to get the browser.
  public readonly browser: Promise<puppeteer.Browser>;
  // Value handed to page.setJavaScriptEnabled() for newly created pages.
  private readonly jsEnabled: boolean;

  /**
   * Get the active page, i.e. the single page whose
   * `document.visibilityState` is 'visible'.
   *
   * Polls `browser.pages()` repeatedly, with no pause between iterations,
   * until exactly one page reports itself visible or `timeout` elapses.
   *
   * @param timeout maximum time to keep polling, in milliseconds
   * @returns the visible page, or null if no single visible page was found
   *          before the timeout
   */
  public async activePage(timeout = 30_000): Promise<puppeteer.Page | null> {
    const browser = await this.browser;
    var start = new Date().getTime();
    while (new Date().getTime() - start < timeout) {
      var pages = await browser.pages();
      var arr = [];
      for (const p of pages) {
        // The check runs inside the page itself, so it needs a round trip per page.
        if (await p.evaluate(() => { return document.visibilityState == 'visible' })) {
          arr.push(p);
        }
      }

      // Zero or several visible pages is treated as "not settled yet": keep polling.
      if (arr.length == 1)
        return arr[0];
    }
    return null;
  }

  /**
   * Launch the browser and wire up per-page configuration.
   *
   * @param jsEnabled value passed to `setJavaScriptEnabled` on new pages
   * @param website   URL the second page opened at the end is sent to
   */
  constructor(jsEnabled = false, website = "https://google.com") {
    console.log(Browser)

    this.browser = puppeteer.launch({
      headless: BaseCrawler.headless,
      // Tor SOCKS proxy, currently disabled:
      //args: ["--proxy-server=socks5://127.0.0.1:9050"],
      userDataDir: "./.headless-data"
    });

    this.jsEnabled = jsEnabled;
    this.browser.then(async (b) => {
      b.on("targetcreated", async (e: Target) => {

        const page = await e.page();
        // Set a Tor-Browser-like user agent with a random Firefox version (60-100).
        page?.setUserAgent(`Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/${_.random(60, 100)}.0`);

        // Disable scripts if it was asked for; chrome:// pages are skipped.
        // NOTE(review): the promise returned by setJavaScriptEnabled is not
        // awaited, and this runs from an event handler, so it may lose a race
        // with the page's first navigation -- confirm against Puppeteer docs.
        if (page?.url()) {
          // console.log(page.url().search("chrome://"))
          if (page.url().search("chrome://") < 0)
            page?.setJavaScriptEnabled(this.jsEnabled)
        }

        // Abort every request whose resource type is 'script'; let the rest through.
        // NOTE(review): setRequestInterception(true) is never called in this
        // snippet; Puppeteer presumably requires it before abort()/continue()
        // may be used -- verify.
        page?.on('request', request => {
          if (request.resourceType() === 'script')
            request.abort();
          else
            request.continue();
        })
      })
    })

    this.browser.then(async (b) => {
      // Two pages are opened: the first result is discarded, the second is navigated.
      b.newPage()
      const page = await b.newPage();
      // Navigation is fired without awaiting its completion.
      page.goto(website);
    })
  }
}
/** Plain Tor window: adds nothing to BaseCrawler, just made to browse Tor. */
export class TorWindow extends BaseCrawler {
};

I tried to hook the newPage function; however, it doesn't work, as it gives me the following error:

PS C:\Users\vince\project\js\darknet-drug-crawler> yarn run browser  yarn run v1.22.10  $ node . --tor-window  undefined  C:\Users\vince\project\js\darknet-drug-crawler\dist\Crawler.js:35          const old_newpage = puppeteer_1.default.Browser.prototype.newPage;                                                          ^    TypeError: Cannot read property 'prototype' of undefined      at new BaseCrawler (C:\Users\vince\project\js\darknet-drug-crawler\dist\Crawler.js:35:57)      at new TorWindow (C:\Users\vince\project\js\darknet-drug-crawler\dist\Crawler.js:98:1)      at Object.<anonymous> (C:\Users\vince\project\js\darknet-drug-crawler\dist\index.js:27:5)      at Module._compile (node:internal/modules/cjs/loader:1092:14)      at Object.Module._extensions..js (node:internal/modules/cjs/loader:1121:10)      at Module.load (node:internal/modules/cjs/loader:972:32)      at Function.Module._load (node:internal/modules/cjs/loader:813:14)      at Function.executeUserEntryPoint [as runMain] (node:internal/modules/run_main:76:12)      at node:internal/main/run_main_module:17:47  error Command failed with exit code 1.  info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.  
// https://stackoverflow.com/questions/39134419/run-tor-browser-with-selenium-webdriver
import puppeteer, { puppeteerErrors, Target, Browser } from "puppeteer";
import { readFileSync } from "fs"
import { helpers } from "./helpers";
import _ from "lodash"

/**
 * Base class for all crawlers (variant that tries to hook `newPage`).
 *
 * Constructing an instance first attempts to replace
 * `puppeteer.Browser.prototype.newPage`, then launches a Puppeteer browser
 * behind a SOCKS5 proxy, registers a "targetcreated" listener that configures
 * new pages, and finally opens pages and navigates to a start URL.
 */
abstract class BaseCrawler {
  // Paths read from the environment. TOR_PATH and TOR_PROFILE_PATH fall back
  // to the empty string; TORRC_PATH stays undefined when the variable is unset.
  public static readonly TOR_PATH = process.env.TOR_PATH ?? "";
  public static readonly TOR_PROFILE_PATH = process.env.TOR_PROFILE_PATH ?? "";
  public static readonly TORRC_PATH = process.env.TORRC_PATH;
  // Passed as the `headless` launch option in the constructor.
  public static headless = false
  // Pending result of puppeteer.launch(); consumers await it to get the browser.
  public readonly browser: Promise<puppeteer.Browser>;
  // Value handed to page.setJavaScriptEnabled() for newly created pages.
  private readonly jsEnabled: boolean;

  /**
   * Get the active page, i.e. the single page whose
   * `document.visibilityState` is 'visible'.
   *
   * Polls `browser.pages()` repeatedly, with no pause between iterations,
   * until exactly one page reports itself visible or `timeout` elapses.
   *
   * @param timeout maximum time to keep polling, in milliseconds
   * @returns the visible page, or null if no single visible page was found
   *          before the timeout
   */
  public async activePage(timeout = 30_000): Promise<puppeteer.Page | null> {
    const browser = await this.browser;
    var start = new Date().getTime();
    while (new Date().getTime() - start < timeout) {
      var pages = await browser.pages();
      var arr = [];
      for (const p of pages) {
        // The check runs inside the page itself, so it needs a round trip per page.
        if (await p.evaluate(() => { return document.visibilityState == 'visible' })) {
          arr.push(p);
        }
      }

      // Zero or several visible pages is treated as "not settled yet": keep polling.
      if (arr.length == 1)
        return arr[0];
    }
    return null;
  }

  /**
   * Hook `newPage`, launch the browser and wire up per-page configuration.
   *
   * @param jsEnabled value passed to `setJavaScriptEnabled` on new pages
   * @param website   URL the second page opened at the end is sent to
   */
  constructor(jsEnabled = false, website = "https://check.torproject.org") {
    console.log(Browser)
    // Attempted monkey-patch: wrap newPage so every page gets the JS setting.
    // The stack trace quoted in this post reports "Cannot read property
    // 'prototype' of undefined" for this statement, i.e. `puppeteer.Browser`
    // on the default import is undefined at runtime.
    const old_newpage = puppeteer.Browser.prototype.newPage
    puppeteer.Browser.prototype.newPage = async () => {
      // NOTE(review): old_newpage is invoked without a receiver; presumably it
      // needs the Browser instance as `this` -- confirm.
      const page = await old_newpage()
      // this.jsEnabled is read when the hook runs, not when it is installed,
      // so the assignment further down is visible here.
      page.setJavaScriptEnabled(this.jsEnabled)
      return page;
    }
    this.browser = puppeteer.launch({
      headless: BaseCrawler.headless,
      // Route traffic through a local SOCKS5 proxy on port 9050.
      args: ["--proxy-server=socks5://127.0.0.1:9050"],
      userDataDir: "./.headless-data"
    });

    this.jsEnabled = jsEnabled;
    this.browser.then(async (b) => {
      b.on("targetcreated", async (e: Target) => {

        const page = await e.page();
        // Set a Tor-Browser-like user agent with a random Firefox version (60-100).
        page?.setUserAgent(`Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/${_.random(60, 100)}.0`);

        // Disable scripts if it was asked for; chrome:// pages are skipped.
        // NOTE(review): the promise returned by setJavaScriptEnabled is not
        // awaited, and this runs from an event handler, so it may lose a race
        // with the page's first navigation -- confirm against Puppeteer docs.
        if (page?.url()) {
          // console.log(page.url().search("chrome://"))
          if (page.url().search("chrome://") < 0)
            page?.setJavaScriptEnabled(this.jsEnabled)
        }

        // Abort every request whose resource type is 'script'; let the rest through.
        // NOTE(review): setRequestInterception(true) is never called in this
        // snippet; Puppeteer presumably requires it before abort()/continue()
        // may be used -- verify.
        page?.on('request', request => {
          if (request.resourceType() === 'script')
            request.abort();
          else
            request.continue();
        })
      })
    })

    this.browser.then(async (b) => {
      // Two pages are opened: the first result is discarded, the second is navigated.
      b.newPage()
      const page = await b.newPage();
      // Navigation is fired without awaiting its completion.
      page.goto(website);
    })
  }
}
/** Plain Tor window: adds nothing to BaseCrawler, just made to browse Tor. */
export class TorWindow extends BaseCrawler {
};
https://stackoverflow.com/questions/66937676/how-can-i-disable-javascript-in-pupeeter April 04, 2021 at 11:04AM

没有评论:

发表评论