I am trying to disable javascript using pupeeter in a base class made to crawl websites however my script fail to so as it's not disabling javascript when I go to any websites. Here is my code:
// https://stackoverflow.com/questions/39134419/run-tor-browser-with-selenium-webdriver import puppeteer, { puppeteerErrors, Target, Browser } from "puppeteer"; import { readFileSync } from "fs" import { helpers } from "./helpers"; import _ from "lodash" /** * Base class for all crawler */ abstract class BaseCrawler { public static readonly TOR_PATH = process.env.TOR_PATH ?? ""; public static readonly TOR_PROFILE_PATH = process.env.TOR_PROFILE_PATH ?? ""; public static readonly TORRC_PATH = process.env.TORRC_PATH; public static headless = false public readonly browser: Promise<puppeteer.Browser>; private readonly jsEnabled: boolean; /** * get the active page * @returns null if it couldn't get the active */ public async activePage(timeout = 30_000): Promise<puppeteer.Page | null> { const browser = await this.browser; var start = new Date().getTime(); while (new Date().getTime() - start < timeout) { var pages = await browser.pages(); var arr = []; for (const p of pages) { if (await p.evaluate(() => { return document.visibilityState == 'visible' })) { arr.push(p); } } if (arr.length == 1) return arr[0]; } return null; } constructor(jsEnabled = false, website = "https://google.com") { console.log(Browser) this.browser = puppeteer.launch({ headless: BaseCrawler.headless, //args: ["--proxy-server=socks5://127.0.0.1:9050"], userDataDir: "./.headless-data" }); this.jsEnabled = jsEnabled; this.browser.then(async (b) => { b.on("targetcreated", async (e: Target) => { const page = await e.page(); // set a tor useragent page?.setUserAgent(`Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/${_.random(60, 100)}.0`); // disable script if it's aksed if (page?.url()) { // console.log(page.url().search("chrome://")) if (page.url().search("chrome://") < 0) page?.setJavaScriptEnabled(this.jsEnabled) } page?.on('request', request => { if (request.resourceType() === 'script') request.abort(); else request.continue(); }) }) }) this.browser.then(async (b) => { b.newPage() const page = await b.newPage(); page.goto(website); }) } } /** Bland tor window just made to browse tor */ export class TorWindow extends BaseCrawler { };
I tried to hook the newPage
function however it doesn't work as it's giving me the following error:
PS C:\Users\vince\project\js\darknet-drug-crawler> yarn run browser yarn run v1.22.10 $ node . --tor-window undefined C:\Users\vince\project\js\darknet-drug-crawler\dist\Crawler.js:35 const old_newpage = puppeteer_1.default.Browser.prototype.newPage; ^ TypeError: Cannot read property 'prototype' of undefined at new BaseCrawler (C:\Users\vince\project\js\darknet-drug-crawler\dist\Crawler.js:35:57) at new TorWindow (C:\Users\vince\project\js\darknet-drug-crawler\dist\Crawler.js:98:1) at Object.<anonymous> (C:\Users\vince\project\js\darknet-drug-crawler\dist\index.js:27:5) at Module._compile (node:internal/modules/cjs/loader:1092:14) at Object.Module._extensions..js (node:internal/modules/cjs/loader:1121:10) at Module.load (node:internal/modules/cjs/loader:972:32) at Function.Module._load (node:internal/modules/cjs/loader:813:14) at Function.executeUserEntryPoint [as runMain] (node:internal/modules/run_main:76:12) at node:internal/main/run_main_module:17:47 error Command failed with exit code 1. info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.
// https://stackoverflow.com/questions/39134419/run-tor-browser-with-selenium-webdriver import puppeteer, { puppeteerErrors, Target, Browser } from "puppeteer"; import { readFileSync } from "fs" import { helpers } from "./helpers"; import _ from "lodash" /** * Base class for all crawler */ abstract class BaseCrawler { public static readonly TOR_PATH = process.env.TOR_PATH ?? ""; public static readonly TOR_PROFILE_PATH = process.env.TOR_PROFILE_PATH ?? ""; public static readonly TORRC_PATH = process.env.TORRC_PATH; public static headless = false public readonly browser: Promise<puppeteer.Browser>; private readonly jsEnabled: boolean; /** * get the active page * @returns null if it couldn't get the active */ public async activePage(timeout = 30_000): Promise<puppeteer.Page | null> { const browser = await this.browser; var start = new Date().getTime(); while (new Date().getTime() - start < timeout) { var pages = await browser.pages(); var arr = []; for (const p of pages) { if (await p.evaluate(() => { return document.visibilityState == 'visible' })) { arr.push(p); } } if (arr.length == 1) return arr[0]; } return null; } constructor(jsEnabled = false, website = "https://check.torproject.org") { console.log(Browser) const old_newpage = puppeteer.Browser.prototype.newPage puppeteer.Browser.prototype.newPage = async () => { const page = await old_newpage() page.setJavaScriptEnabled(this.jsEnabled) return page; } this.browser = puppeteer.launch({ headless: BaseCrawler.headless, args: ["--proxy-server=socks5://127.0.0.1:9050"], userDataDir: "./.headless-data" }); this.jsEnabled = jsEnabled; this.browser.then(async (b) => { b.on("targetcreated", async (e: Target) => { const page = await e.page(); // set a tor useragent page?.setUserAgent(`Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/${_.random(60, 100)}.0`); // disable script if it's aksed if (page?.url()) { // console.log(page.url().search("chrome://")) if (page.url().search("chrome://") < 0) page?.setJavaScriptEnabled(this.jsEnabled) } page?.on('request', request => { if (request.resourceType() === 'script') request.abort(); else request.continue(); }) }) }) this.browser.then(async (b) => { b.newPage() const page = await b.newPage(); page.goto(website); }) } } /** Bland tor window just made to browse tor */ export class TorWindow extends BaseCrawler { };
https://stackoverflow.com/questions/66937676/how-can-i-disable-javascript-in-pupeeter April 04, 2021 at 11:04AM
没有评论:
发表评论