Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions lib/FredyPipelineExecutioner.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,15 @@ class FredyPipelineExecutioner {
* @param {string} providerId The ID of the provider currently in use.
* @param {string} jobKey Key of the job that is currently running (from within the config).
* @param {SimilarityCache} similarityCache Cache instance for checking similar entries.
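* @param {object} [browser] Optional shared Puppeteer browser instance; it is stored on the executioner and handed to the extractor so listings can be fetched without launching a new browser.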
*/
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache, browser) {
this._providerConfig = providerConfig;
this._notificationConfig = notificationConfig;
this._providerId = providerId;
this._jobKey = jobKey;
this._similarityCache = similarityCache;
this._browser = browser;
}

/**
Expand Down Expand Up @@ -119,7 +121,7 @@ class FredyPipelineExecutioner {
* @returns {Promise<Listing[]>} Resolves with an array of listings (empty when none found).
*/
_getListings(url) {
const extractor = new Extractor();
const extractor = new Extractor({ ...this._providerConfig.puppeteerOptions, browser: this._browser });
return new Promise((resolve, reject) => {
extractor
.execute(url, this._providerConfig.waitForSelector)
Expand Down
53 changes: 0 additions & 53 deletions lib/provider/immonet.js

This file was deleted.

119 changes: 68 additions & 51 deletions lib/services/extractor/puppeteerExtractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,52 +19,80 @@ import path from 'path';

puppeteer.use(StealthPlugin());

/**
 * Launches a Puppeteer browser configured with the project's default Chromium
 * flags plus the language, window-size and extra arguments derived from
 * getPreLaunchConfig(url, options).
 *
 * The returned browser is tagged with two bookkeeping properties that
 * closeBrowser reads later:
 *  - `__fredy_userDataDir`: the profile directory the browser was started with.
 *  - `__fredy_removeUserDataDir`: true when that directory was created here
 *    (temporary) and should be deleted on close.
 *
 * @param {string} url URL that is going to be visited; forwarded to getPreLaunchConfig.
 * @param {object} [options] Launch options.
 * @param {string} [options.proxyUrl] Added as `--proxy-server=<proxyUrl>` when set.
 * @param {string} [options.userDataDir] Profile directory to use. When omitted, a
 *   fresh temporary directory is created and flagged for removal.
 * @param {boolean} [options.puppeteerHeadless] Headless mode, defaults to true.
 * @param {number} [options.puppeteerTimeout] Launch timeout in ms, defaults to 30000.
 * @param {string} [options.executablePath] Path to the browser executable to launch.
 * @returns {Promise<object>} The launched browser instance.
 * @throws Rethrows any error raised by puppeteer.launch after cleaning up a
 *   temporary profile directory created by this call.
 */
export async function launchBrowser(url, options) {
  const preCfg = getPreLaunchConfig(url, options || {});
  const launchArgs = [
    '--no-sandbox',
    '--disable-gpu',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-crash-reporter',
    '--no-first-run',
    '--no-default-browser-check',
    preCfg.langArg,
    preCfg.windowSizeArg,
    ...preCfg.extraArgs,
  ];
  if (options?.proxyUrl) {
    launchArgs.push(`--proxy-server=${options.proxyUrl}`);
  }

  // Use the caller's profile directory when given; otherwise create a
  // dedicated temporary one that we own and must remove again.
  let userDataDir = options?.userDataDir;
  const removeUserDataDir = !userDataDir;
  if (removeUserDataDir) {
    const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
    userDataDir = fs.mkdtempSync(prefix);
  }

  let browser;
  try {
    browser = await puppeteer.launch({
      headless: options?.puppeteerHeadless ?? true,
      args: launchArgs,
      timeout: options?.puppeteerTimeout || 30_000,
      userDataDir,
      executablePath: options?.executablePath,
    });
  } catch (error) {
    // The launch failed, so the caller never receives a browser it could pass
    // to closeBrowser. Remove the temporary profile directory here, otherwise
    // every failed launch would leave one behind in the temp folder.
    if (removeUserDataDir) {
      try {
        await fs.promises.rm(userDataDir, { recursive: true, force: true });
      } catch {
        // best-effort cleanup; the launch error below is the one that matters
      }
    }
    throw error;
  }

  browser.__fredy_userDataDir = userDataDir;
  browser.__fredy_removeUserDataDir = removeUserDataDir;

  return browser;
}

/**
 * Closes a browser and, if the browser is flagged as owning its profile
 * directory, deletes that directory afterwards.
 *
 * Both steps are best-effort: errors from closing the browser or from
 * removing the directory are swallowed so cleanup never throws.
 *
 * @param {object} [browser] Browser to close. It is expected to carry the
 *   `__fredy_userDataDir` / `__fredy_removeUserDataDir` properties; a falsy
 *   value makes this a no-op.
 * @returns {Promise<void>}
 */
export async function closeBrowser(browser) {
  if (!browser) {
    return;
  }

  // Read the bookkeeping flags before closing the browser.
  const { __fredy_userDataDir: profileDir, __fredy_removeUserDataDir: ownsProfileDir } = browser;

  try {
    await browser.close();
  } catch {
    // ignore
  }

  // Only directories flagged for removal are deleted.
  if (!ownsProfileDir || !profileDir) {
    return;
  }

  try {
    await fs.promises.rm(profileDir, { recursive: true, force: true });
  } catch {
    // ignore
  }
}

export default async function execute(url, waitForSelector, options) {
let browser;
let browser = options?.browser;
let isExternalBrowser = !!browser;
let page;
let result;
let userDataDir;
let removeUserDataDir = false;
try {
debug(`Sending request to ${url} using Puppeteer.`);

// Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs
if (options && options.userDataDir) {
userDataDir = options.userDataDir;
removeUserDataDir = !!options.cleanupUserDataDir;
} else {
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
userDataDir = fs.mkdtempSync(prefix);
removeUserDataDir = true;
if (!isExternalBrowser) {
browser = await launchBrowser(url, options);
}

const launchArgs = [
'--no-sandbox',
'--disable-gpu',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-crash-reporter',
'--no-first-run',
'--no-default-browser-check',
];
if (options?.proxyUrl) {
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
}
// Prepare bot prevention pre-launch config
const preCfg = getPreLaunchConfig(url, options || {});
launchArgs.push(preCfg.langArg);
launchArgs.push(preCfg.windowSizeArg);
launchArgs.push(...preCfg.extraArgs);

browser = await puppeteer.launch({
headless: options?.puppeteerHeadless ?? true,
args: launchArgs,
timeout: options?.puppeteerTimeout || 30_000,
userDataDir,
executablePath: options?.executablePath, // allow using system Chrome
});

page = await browser.newPage();
const preCfg = getPreLaunchConfig(url, options || {});
await applyBotPreventionToPage(page, preCfg);
// Provide languages value before navigation
await applyLanguagePersistence(page, preCfg);
Expand Down Expand Up @@ -104,7 +132,7 @@ export default async function execute(url, waitForSelector, options) {
result = pageSource || (await page.content());
}
} catch (error) {
if (error?.message?.includes('Timeout')) {
if (error?.name?.includes('Timeout')) {
logger.debug('Error executing with puppeteer executor', error);
} else {
logger.warn('Error executing with puppeteer executor', error);
Expand All @@ -118,19 +146,8 @@ export default async function execute(url, waitForSelector, options) {
} catch {
// ignore
}
try {
if (browser != null) {
await browser.close();
}
} catch {
// ignore
}
try {
if (removeUserDataDir && userDataDir) {
await fs.promises.rm(userDataDir, { recursive: true, force: true });
}
} catch {
// ignore
if (browser != null && !isExternalBrowser) {
await closeBrowser(browser);
}
}
return result;
Expand Down
50 changes: 31 additions & 19 deletions lib/services/jobs/jobExecutionService.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import FredyPipelineExecutioner from '../../FredyPipelineExecutioner.js';
import * as similarityCache from '../similarity-check/similarityCache.js';
import { isRunning, markFinished, markRunning } from './run-state.js';
import { sendToUsers } from '../sse/sse-broker.js';
import * as puppeteerExtractor from '../extractor/puppeteerExtractor.js';

/**
* Initializes the job execution service.
Expand Down Expand Up @@ -94,7 +95,7 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
* @param {{userId?: string, isAdmin?: boolean}} [context] - Who requested the run; determines job filtering.
* @returns {Promise<void>} Resolves after the selected jobs have been executed one after another.
*/
function runAll(respectWorkingHours = true, context = undefined) {
async function runAll(respectWorkingHours = true, context = undefined) {
if (settings.demoMode) return;
const now = Date.now();
const withinHours = duringWorkingHoursOrNotSet(settings, now);
Expand All @@ -103,15 +104,18 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
return;
}
settings.lastRun = now;
jobStorage
const jobs = jobStorage
.getJobs()
.filter((job) => job.enabled)
.filter((job) => {
if (!context) return true; // startup/cron → all
if (context.isAdmin) return true; // admin → all
return context.userId ? job.userId === context.userId : false; // user → own
})
.forEach((job) => executeJob(job));
});

for (const job of jobs) {
await executeJob(job);
}
}

/**
Expand Down Expand Up @@ -154,28 +158,36 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
} catch (err) {
logger.warn('Failed to emit start status for job', job.id, err);
}
let browser;
try {
const jobProviders = job.provider.filter(
(p) => providers.find((loaded) => loaded.metaInformation.id === p.id) != null,
);
const executions = jobProviders.map(async (prov) => {
const matchedProvider = providers.find((loaded) => loaded.metaInformation.id === prov.id);
matchedProvider.init(prov, job.blacklist);
await new FredyPipelineExecutioner(
matchedProvider.config,
job.notificationAdapter,
prov.id,
job.id,
similarityCache,
).execute();
});
const results = await Promise.allSettled(executions);
for (const r of results) {
if (r.status === 'rejected') {
logger.error(r.reason);
for (const prov of jobProviders) {
try {
const matchedProvider = providers.find((loaded) => loaded.metaInformation.id === prov.id);
matchedProvider.init(prov, job.blacklist);

if (!browser && matchedProvider.config.getListings == null) {
browser = await puppeteerExtractor.launchBrowser(matchedProvider.config.url, {});

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve Puppeteer launch options for shared browser

executeJob now creates one shared browser via launchBrowser(matchedProvider.config.url, {}), which drops the options path that _getListings now forwards to Extractor (including DEFAULT_OPTIONS.puppeteerTimeout = 60000 in extractor.js). Because launchBrowser falls back to a 30s launch timeout, slower hosts/containers can time out during Chromium startup and then all Puppeteer-backed providers in that job fail to scrape. Pass provider/extractor Puppeteer options into this launch call so shared-browser runs keep prior timeout/proxy/executable behavior.

Useful? React with 👍 / 👎.

}

await new FredyPipelineExecutioner(
matchedProvider.config,
job.notificationAdapter,
prov.id,
job.id,
similarityCache,
browser,
).execute();
} catch (err) {
logger.error(err);
}
}
} finally {
if (browser) {
await puppeteerExtractor.closeBrowser(browser);
}
markFinished(job.id);
try {
bus.emit('jobs:status', { jobId: job.id, running: false });
Expand Down
Loading