Update
This commit is contained in:
@ -8,6 +8,20 @@ export interface ExtractedContent {
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
 * Tuning knobs for `retryWithBackoff`.
 *
 * All delays are expressed in milliseconds.
 */
interface RetryOptions {
  /**
   * Attempt number at which retrying stops. Attempts are counted from 1, so
   * this is effectively the maximum total number of attempts (not the number
   * of additional retries after the first try).
   */
  maxRetries: number;
  /** Delay applied before the first retry, in milliseconds. */
  baseDelay: number;
  /** Upper bound for any single delay, in milliseconds. */
  maxDelay: number;
  /** Factor the delay is multiplied by after each failed attempt. */
  backoffMultiplier: number;
}
|
||||
|
||||
/**
 * Retry policy used by `retryWithBackoff` when the caller passes no options:
 * up to 3 attempts, waiting 1000 ms before the first retry, doubling the wait
 * after each failure, and never waiting longer than 10000 ms.
 */
const DEFAULT_RETRY_OPTIONS: RetryOptions = {
  maxRetries: 3,
  baseDelay: 1000,
  maxDelay: 10000,
  backoffMultiplier: 2,
};
|
||||
|
||||
// Singleton browser instance for reuse across extractions. Starts out as
// null; getBrowser() returns it and closeBrowser() closes it.
let sharedBrowser: Browser | null = null;
|
||||
|
||||
@ -146,12 +160,74 @@ async function getBrowser(): Promise<Browser> {
|
||||
"--disable-gpu",
|
||||
"--disable-web-security",
|
||||
"--disable-features=VizDisplayCompositor",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-field-trial-config",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--enable-automation",
|
||||
"--force-device-scale-factor=1",
|
||||
"--ignore-certificate-errors",
|
||||
"--ignore-ssl-errors",
|
||||
"--ignore-certificate-errors-spki-list",
|
||||
"--allow-running-insecure-content",
|
||||
"--disable-extensions",
|
||||
"--no-default-browser-check",
|
||||
"--disable-default-apps",
|
||||
"--disable-sync",
|
||||
"--metrics-recording-only",
|
||||
"--no-pings",
|
||||
"--mute-audio"
|
||||
],
|
||||
});
|
||||
}
|
||||
return sharedBrowser;
|
||||
}
|
||||
|
||||
// Substrings of error messages that mark a failure as transient (network or
// browser-session trouble) and therefore worth retrying.
const RETRYABLE_ERROR_MARKERS: readonly string[] = [
  'ERR_SOCKET_NOT_CONNECTED',
  'ERR_CONNECTION_REFUSED',
  'ERR_CONNECTION_RESET',
  'ERR_NETWORK_CHANGED',
  'ERR_INTERNET_DISCONNECTED',
  'ERR_NAME_NOT_RESOLVED',
  'ERR_TIMED_OUT',
  'Protocol error',
  'Navigation timeout',
  'net::',
  'Target closed',
  'Session closed',
];

// Type guard: true when `error` is an Error whose message contains one of the
// retryable markers. Non-Error throwables are never considered retryable.
function isRetryableError(error: unknown): error is Error {
  return (
    error instanceof Error &&
    RETRYABLE_ERROR_MARKERS.some((marker) => error.message.includes(marker))
  );
}

/**
 * Helper function for exponential backoff retry.
 *
 * Runs `operation` and, when it rejects with a retryable error, waits and
 * tries again. The wait before retry number `k` is
 * `baseDelay * backoffMultiplier^(k - 1)`, capped at `maxDelay`.
 *
 * @param operation Async work to run; invoked once per attempt.
 * @param options Retry policy; defaults to `DEFAULT_RETRY_OPTIONS`.
 * @param attempt 1-based number of the attempt to start from. Callers
 *   normally omit it.
 * @returns The value `operation` resolves with.
 * @throws The error from `operation` as soon as it is not retryable, or once
 *   the attempt number reaches `options.maxRetries`.
 */
async function retryWithBackoff<T>(
  operation: () => Promise<T>,
  options: RetryOptions = DEFAULT_RETRY_OPTIONS,
  attempt: number = 1
): Promise<T> {
  // Iterative rather than recursive so a large maxRetries cannot build up a
  // long chain of nested promises.
  for (let currentAttempt = attempt; ; currentAttempt++) {
    try {
      return await operation();
    } catch (error) {
      // Attempt budget is checked first, then retryability; either way the
      // original error is rethrown untouched.
      if (currentAttempt >= options.maxRetries || !isRetryableError(error)) {
        throw error;
      }

      const delay = Math.min(
        options.baseDelay * Math.pow(options.backoffMultiplier, currentAttempt - 1),
        options.maxDelay
      );

      console.log(`Attempt ${currentAttempt} failed, retrying in ${delay}ms:`, error.message);
      await waitForTimeout(delay);
    }
  }
}
|
||||
|
||||
export async function closeBrowser(): Promise<void> {
|
||||
if (sharedBrowser && sharedBrowser.isConnected()) {
|
||||
await sharedBrowser.close();
|
||||
@ -159,49 +235,79 @@ export async function closeBrowser(): Promise<void> {
|
||||
}
|
||||
}
|
||||
|
||||
export async function extractArticleContent(
|
||||
url: string,
|
||||
): Promise<ExtractedContent> {
|
||||
console.log(`Starting content extraction for: ${url}`);
|
||||
let page = null;
|
||||
try {
|
||||
const browser = await getBrowser();
|
||||
page = await browser.newPage();
|
||||
async function extractWithRetry(url: string): Promise<ExtractedContent> {
|
||||
const userAgents = [
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
|
||||
];
|
||||
|
||||
// Set user agent and viewport
|
||||
await page.setUserAgent(
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
||||
);
|
||||
await page.setViewport({ width: 1280, height: 720 });
|
||||
return retryWithBackoff(async () => {
|
||||
let page = null;
|
||||
try {
|
||||
const browser = await getBrowser();
|
||||
page = await browser.newPage();
|
||||
|
||||
// Set navigation timeout and disable images for faster loading
|
||||
page.setDefaultNavigationTimeout(45000);
|
||||
page.setDefaultTimeout(45000);
|
||||
|
||||
// Block unnecessary resources to speed up loading
|
||||
await page.setRequestInterception(true);
|
||||
page.on('request', (req) => {
|
||||
const resourceType = req.resourceType();
|
||||
if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
|
||||
req.abort();
|
||||
} else {
|
||||
req.continue();
|
||||
// Randomize user agent to avoid detection
|
||||
const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)] || userAgents[0];
|
||||
await page.setUserAgent(userAgent!);
|
||||
await page.setViewport({ width: 1280, height: 720 });
|
||||
|
||||
// Set longer timeout for problematic sites
|
||||
page.setDefaultNavigationTimeout(60000);
|
||||
page.setDefaultTimeout(60000);
|
||||
|
||||
// Block unnecessary resources to speed up loading
|
||||
await page.setRequestInterception(true);
|
||||
page.on('request', (req) => {
|
||||
const resourceType = req.resourceType();
|
||||
if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
|
||||
req.abort();
|
||||
} else {
|
||||
req.continue();
|
||||
}
|
||||
});
|
||||
|
||||
// Add extra headers to appear more like a real browser
|
||||
await page.setExtraHTTPHeaders({
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'DNT': '1',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1'
|
||||
});
|
||||
|
||||
// Navigate with multiple wait strategies
|
||||
let response;
|
||||
try {
|
||||
response = await page.goto(url, {
|
||||
waitUntil: ["domcontentloaded", "networkidle0"],
|
||||
timeout: 60000,
|
||||
});
|
||||
} catch (networkError) {
|
||||
// Fallback to more basic wait strategy
|
||||
const errorMessage = networkError instanceof Error ? networkError.message : 'Unknown error';
|
||||
console.log('Network idle failed, trying domcontentloaded only:', errorMessage);
|
||||
response = await page.goto(url, {
|
||||
waitUntil: "domcontentloaded",
|
||||
timeout: 60000,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
// Navigate to the page with better waiting strategy
|
||||
const response = await page.goto(url, {
|
||||
waitUntil: "domcontentloaded",
|
||||
timeout: 45000,
|
||||
});
|
||||
if (!response) {
|
||||
throw new Error('No response received from server');
|
||||
}
|
||||
|
||||
if (!response || !response.ok()) {
|
||||
throw new Error(`HTTP ${response?.status()}: Failed to load page`);
|
||||
}
|
||||
const status = response?.status();
|
||||
if (status && status >= 400) {
|
||||
throw new Error(`HTTP ${status}: ${response?.statusText() || 'Unknown error'}`);
|
||||
}
|
||||
|
||||
// Enhanced dynamic content handling
|
||||
console.log('Handling dynamic content...');
|
||||
await handleDynamicContent(page);
|
||||
// Enhanced dynamic content handling
|
||||
console.log('Handling dynamic content...');
|
||||
await handleDynamicContent(page);
|
||||
|
||||
// Extract content using advanced multi-strategy approach
|
||||
console.log('Extracting content using multi-strategy approach...');
|
||||
@ -420,13 +526,13 @@ export async function extractArticleContent(
|
||||
|
||||
console.log(`Found ${candidates.length} content candidates`);
|
||||
if (candidates.length > 0) {
|
||||
console.log(`Best candidate score: ${candidates[0].score}, selector: ${candidates[0].selector}`);
|
||||
console.log(`Best candidate score: ${candidates[0]!.score}, selector: ${candidates[0]!.selector}`);
|
||||
}
|
||||
|
||||
// Get the best content
|
||||
let content = "";
|
||||
if (candidates.length > 0) {
|
||||
content = candidates[0].content;
|
||||
content = candidates[0]!.content;
|
||||
|
||||
// If the best candidate is still short, try combining top candidates
|
||||
if (content.length < 500 && candidates.length > 1) {
|
||||
@ -513,9 +619,9 @@ export async function extractArticleContent(
|
||||
extractedData.content = fallbackData;
|
||||
} else {
|
||||
return {
|
||||
title: extractedData.title,
|
||||
title: extractedData.title || '',
|
||||
content: extractedData.content || "",
|
||||
description: extractedData.description,
|
||||
description: extractedData.description || '',
|
||||
success: false,
|
||||
error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
|
||||
};
|
||||
@ -529,26 +635,65 @@ export async function extractArticleContent(
|
||||
content = content.substring(0, maxLength) + "...";
|
||||
}
|
||||
|
||||
console.log(`Successfully extracted content: ${content.length} characters`);
|
||||
return {
|
||||
title: extractedData.title,
|
||||
content,
|
||||
description: extractedData.description,
|
||||
success: true,
|
||||
};
|
||||
console.log(`Successfully extracted content: ${content.length} characters`);
|
||||
return {
|
||||
title: extractedData.title,
|
||||
content,
|
||||
description: extractedData.description,
|
||||
success: true,
|
||||
};
|
||||
} catch (error) {
|
||||
console.error(`Content extraction attempt failed:`, error);
|
||||
throw error; // Let retry logic handle this
|
||||
} finally {
|
||||
if (page) {
|
||||
try {
|
||||
await page.close();
|
||||
} catch (closeError) {
|
||||
console.warn('Failed to close page:', closeError);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Known network failure codes and the friendlier message reported for each.
// Checked in order; the first code found in the error message wins.
const NETWORK_ERROR_DESCRIPTIONS: ReadonlyArray<readonly [string, string]> = [
  ['ERR_SOCKET_NOT_CONNECTED', "Network connection failed - server may be unreachable"],
  ['ERR_CONNECTION_REFUSED', "Connection refused by server"],
  ['ERR_NAME_NOT_RESOLVED', "DNS resolution failed - domain may not exist"],
  ['ERR_TIMED_OUT', "Request timed out - server too slow"],
];

// Turns whatever was thrown during extraction into a user-facing message.
function describeExtractionError(error: unknown): string {
  if (!(error instanceof Error)) {
    return "Unknown error occurred";
  }

  const { message } = error;
  for (const [code, description] of NETWORK_ERROR_DESCRIPTIONS) {
    if (message.includes(code)) {
      return description;
    }
  }
  if (message.includes('HTTP 4')) {
    return `Client error: ${message}`;
  }
  if (message.includes('HTTP 5')) {
    return `Server error: ${message}`;
  }
  return message;
}

/**
 * Extracts the article content for `url`, delegating the actual work (and its
 * retries) to `extractWithRetry`.
 *
 * This function never rejects: if extraction still fails after retrying, the
 * failure is logged and reported through the returned object instead.
 *
 * @param url Address of the page to extract.
 * @returns The result from `extractWithRetry` on success; otherwise an object
 *   with empty `title`, `content` and `description`, `success: false`, and a
 *   descriptive `error` message.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  console.log(`Starting content extraction for: ${url}`);

  try {
    // Page lifecycle (open/close) is handled inside extractWithRetry, so no
    // cleanup is needed here.
    return await extractWithRetry(url);
  } catch (error) {
    console.error(`Content extraction failed after all retries for ${url}:`, error);

    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: describeExtractionError(error),
    };
  }
}
|
||||
|
||||
|
Reference in New Issue
Block a user