Update
This commit is contained in:
@ -8,6 +8,20 @@ export interface ExtractedContent {
|
|||||||
error?: string;
|
error?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Tuning knobs for the exponential-backoff retry helper.
 *
 * Delays are in milliseconds (the retry helper logs the computed delay
 * with an `ms` suffix).
 */
interface RetryOptions {
  /**
   * Attempt number at which the retry helper stops retrying and rethrows.
   * Because attempts are counted from 1, this is effectively the maximum
   * number of total attempts, not the number of extra retries.
   */
  maxRetries: number;
  /** Delay before the first retry, in milliseconds. */
  baseDelay: number;
  /** Upper bound applied to every computed delay, in milliseconds. */
  maxDelay: number;
  /** Factor the delay is multiplied by for each further failed attempt. */
  backoffMultiplier: number;
}
|
||||||
|
|
||||||
|
/**
 * Retry settings used when a caller does not supply its own.
 *
 * With these values the delays before successive retries are 1000 ms and
 * 2000 ms (baseDelay * backoffMultiplier^(attempt - 1)), capped at 10000 ms.
 */
const DEFAULT_RETRY_OPTIONS: RetryOptions = {
  maxRetries: 3,
  baseDelay: 1000,
  maxDelay: 10000,
  backoffMultiplier: 2,
};
|
||||||
|
|
||||||
// Singleton browser instance for reuse
|
// Singleton browser instance for reuse
|
||||||
let sharedBrowser: Browser | null = null;
|
let sharedBrowser: Browser | null = null;
|
||||||
|
|
||||||
@ -146,12 +160,74 @@ async function getBrowser(): Promise<Browser> {
|
|||||||
"--disable-gpu",
|
"--disable-gpu",
|
||||||
"--disable-web-security",
|
"--disable-web-security",
|
||||||
"--disable-features=VizDisplayCompositor",
|
"--disable-features=VizDisplayCompositor",
|
||||||
|
"--disable-background-timer-throttling",
|
||||||
|
"--disable-backgrounding-occluded-windows",
|
||||||
|
"--disable-renderer-backgrounding",
|
||||||
|
"--disable-field-trial-config",
|
||||||
|
"--disable-ipc-flooding-protection",
|
||||||
|
"--enable-automation",
|
||||||
|
"--force-device-scale-factor=1",
|
||||||
|
"--ignore-certificate-errors",
|
||||||
|
"--ignore-ssl-errors",
|
||||||
|
"--ignore-certificate-errors-spki-list",
|
||||||
|
"--allow-running-insecure-content",
|
||||||
|
"--disable-extensions",
|
||||||
|
"--no-default-browser-check",
|
||||||
|
"--disable-default-apps",
|
||||||
|
"--disable-sync",
|
||||||
|
"--metrics-recording-only",
|
||||||
|
"--no-pings",
|
||||||
|
"--mute-audio"
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
return sharedBrowser;
|
return sharedBrowser;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Substrings that mark an error message as a transient failure worth
 * retrying (network-level errors, navigation timeouts, and closed
 * targets/sessions).
 */
const RETRYABLE_ERROR_MARKERS: readonly string[] = [
  'ERR_SOCKET_NOT_CONNECTED',
  'ERR_CONNECTION_REFUSED',
  'ERR_CONNECTION_RESET',
  'ERR_NETWORK_CHANGED',
  'ERR_INTERNET_DISCONNECTED',
  'ERR_NAME_NOT_RESOLVED',
  'ERR_TIMED_OUT',
  'Protocol error',
  'Navigation timeout',
  'net::',
  'Target closed',
  'Session closed',
];

/** Returns true when `message` contains at least one retryable marker. */
function hasRetryableMarker(message: string): boolean {
  return RETRYABLE_ERROR_MARKERS.some((marker) => message.includes(marker));
}

/**
 * Runs `operation`, retrying with exponential backoff on transient errors.
 *
 * @param operation - Async work to perform; invoked once per attempt.
 * @param options - Backoff settings; defaults to `DEFAULT_RETRY_OPTIONS`.
 * @param attempt - 1-based number of the first attempt. Callers normally
 *   leave this at its default.
 * @returns The value resolved by the first successful `operation()` call.
 * @throws The error from `operation` when the attempt number has reached
 *   `options.maxRetries`, when the thrown value is not an `Error`, or when
 *   its message contains none of the retryable markers.
 */
async function retryWithBackoff<T>(
  operation: () => Promise<T>,
  options: RetryOptions = DEFAULT_RETRY_OPTIONS,
  attempt: number = 1
): Promise<T> {
  // Loop instead of recursing so repeated failures do not nest promises.
  for (let currentAttempt = attempt; ; currentAttempt++) {
    try {
      return await operation();
    } catch (error) {
      // Out of attempts: surface the last failure unchanged.
      if (currentAttempt >= options.maxRetries) {
        throw error;
      }

      // Only transient, recognisable failures are worth another attempt.
      if (!(error instanceof Error) || !hasRetryableMarker(error.message)) {
        throw error;
      }

      // baseDelay * multiplier^(attempt - 1), never above maxDelay.
      const delay = Math.min(
        options.baseDelay * Math.pow(options.backoffMultiplier, currentAttempt - 1),
        options.maxDelay
      );

      console.log(`Attempt ${currentAttempt} failed, retrying in ${delay}ms:`, error.message);
      // waitForTimeout is defined elsewhere in this file; presumably it
      // resolves after `delay` milliseconds — confirm against its definition.
      await waitForTimeout(delay);
    }
  }
}
|
||||||
|
|
||||||
export async function closeBrowser(): Promise<void> {
|
export async function closeBrowser(): Promise<void> {
|
||||||
if (sharedBrowser && sharedBrowser.isConnected()) {
|
if (sharedBrowser && sharedBrowser.isConnected()) {
|
||||||
await sharedBrowser.close();
|
await sharedBrowser.close();
|
||||||
@ -159,24 +235,28 @@ export async function closeBrowser(): Promise<void> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function extractArticleContent(
|
async function extractWithRetry(url: string): Promise<ExtractedContent> {
|
||||||
url: string,
|
const userAgents = [
|
||||||
): Promise<ExtractedContent> {
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||||
console.log(`Starting content extraction for: ${url}`);
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
|
||||||
|
];
|
||||||
|
|
||||||
|
return retryWithBackoff(async () => {
|
||||||
let page = null;
|
let page = null;
|
||||||
try {
|
try {
|
||||||
const browser = await getBrowser();
|
const browser = await getBrowser();
|
||||||
page = await browser.newPage();
|
page = await browser.newPage();
|
||||||
|
|
||||||
// Set user agent and viewport
|
// Randomize user agent to avoid detection
|
||||||
await page.setUserAgent(
|
const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)] || userAgents[0];
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
await page.setUserAgent(userAgent!);
|
||||||
);
|
|
||||||
await page.setViewport({ width: 1280, height: 720 });
|
await page.setViewport({ width: 1280, height: 720 });
|
||||||
|
|
||||||
// Set navigation timeout and disable images for faster loading
|
// Set longer timeout for problematic sites
|
||||||
page.setDefaultNavigationTimeout(45000);
|
page.setDefaultNavigationTimeout(60000);
|
||||||
page.setDefaultTimeout(45000);
|
page.setDefaultTimeout(60000);
|
||||||
|
|
||||||
// Block unnecessary resources to speed up loading
|
// Block unnecessary resources to speed up loading
|
||||||
await page.setRequestInterception(true);
|
await page.setRequestInterception(true);
|
||||||
@ -189,14 +269,40 @@ export async function extractArticleContent(
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Navigate to the page with better waiting strategy
|
// Add extra headers to appear more like a real browser
|
||||||
const response = await page.goto(url, {
|
await page.setExtraHTTPHeaders({
|
||||||
waitUntil: "domcontentloaded",
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
||||||
timeout: 45000,
|
'Accept-Language': 'en-US,en;q=0.5',
|
||||||
|
'Accept-Encoding': 'gzip, deflate, br',
|
||||||
|
'DNT': '1',
|
||||||
|
'Connection': 'keep-alive',
|
||||||
|
'Upgrade-Insecure-Requests': '1'
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!response || !response.ok()) {
|
// Navigate with multiple wait strategies
|
||||||
throw new Error(`HTTP ${response?.status()}: Failed to load page`);
|
let response;
|
||||||
|
try {
|
||||||
|
response = await page.goto(url, {
|
||||||
|
waitUntil: ["domcontentloaded", "networkidle0"],
|
||||||
|
timeout: 60000,
|
||||||
|
});
|
||||||
|
} catch (networkError) {
|
||||||
|
// Fallback to more basic wait strategy
|
||||||
|
const errorMessage = networkError instanceof Error ? networkError.message : 'Unknown error';
|
||||||
|
console.log('Network idle failed, trying domcontentloaded only:', errorMessage);
|
||||||
|
response = await page.goto(url, {
|
||||||
|
waitUntil: "domcontentloaded",
|
||||||
|
timeout: 60000,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!response) {
|
||||||
|
throw new Error('No response received from server');
|
||||||
|
}
|
||||||
|
|
||||||
|
const status = response?.status();
|
||||||
|
if (status && status >= 400) {
|
||||||
|
throw new Error(`HTTP ${status}: ${response?.statusText() || 'Unknown error'}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enhanced dynamic content handling
|
// Enhanced dynamic content handling
|
||||||
@ -420,13 +526,13 @@ export async function extractArticleContent(
|
|||||||
|
|
||||||
console.log(`Found ${candidates.length} content candidates`);
|
console.log(`Found ${candidates.length} content candidates`);
|
||||||
if (candidates.length > 0) {
|
if (candidates.length > 0) {
|
||||||
console.log(`Best candidate score: ${candidates[0].score}, selector: ${candidates[0].selector}`);
|
console.log(`Best candidate score: ${candidates[0]!.score}, selector: ${candidates[0]!.selector}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the best content
|
// Get the best content
|
||||||
let content = "";
|
let content = "";
|
||||||
if (candidates.length > 0) {
|
if (candidates.length > 0) {
|
||||||
content = candidates[0].content;
|
content = candidates[0]!.content;
|
||||||
|
|
||||||
// If the best candidate is still short, try combining top candidates
|
// If the best candidate is still short, try combining top candidates
|
||||||
if (content.length < 500 && candidates.length > 1) {
|
if (content.length < 500 && candidates.length > 1) {
|
||||||
@ -513,9 +619,9 @@ export async function extractArticleContent(
|
|||||||
extractedData.content = fallbackData;
|
extractedData.content = fallbackData;
|
||||||
} else {
|
} else {
|
||||||
return {
|
return {
|
||||||
title: extractedData.title,
|
title: extractedData.title || '',
|
||||||
content: extractedData.content || "",
|
content: extractedData.content || "",
|
||||||
description: extractedData.description,
|
description: extractedData.description || '',
|
||||||
success: false,
|
success: false,
|
||||||
error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
|
error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
|
||||||
};
|
};
|
||||||
@ -537,18 +643,57 @@ export async function extractArticleContent(
|
|||||||
success: true,
|
success: true,
|
||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`Content extraction failed for ${url}:`, error);
|
console.error(`Content extraction attempt failed:`, error);
|
||||||
|
throw error; // Let retry logic handle this
|
||||||
|
} finally {
|
||||||
|
if (page) {
|
||||||
|
try {
|
||||||
|
await page.close();
|
||||||
|
} catch (closeError) {
|
||||||
|
console.warn('Failed to close page:', closeError);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Converts a failure raised during extraction into a user-facing message.
 *
 * Known network error codes map to fixed explanations, messages containing
 * `HTTP 4` / `HTTP 5` are prefixed as client / server errors, and any other
 * `Error` contributes its own message. Non-`Error` values and errors with an
 * empty message yield "Unknown error occurred".
 */
function describeExtractionError(error: unknown): string {
  if (!(error instanceof Error)) {
    return "Unknown error occurred";
  }

  const { message } = error;

  if (message.includes('ERR_SOCKET_NOT_CONNECTED')) {
    return "Network connection failed - server may be unreachable";
  }
  if (message.includes('ERR_CONNECTION_REFUSED')) {
    return "Connection refused by server";
  }
  if (message.includes('ERR_NAME_NOT_RESOLVED')) {
    return "DNS resolution failed - domain may not exist";
  }
  if (message.includes('ERR_TIMED_OUT')) {
    return "Request timed out - server too slow";
  }
  if (message.includes('HTTP 4')) {
    return `Client error: ${message}`;
  }
  if (message.includes('HTTP 5')) {
    return `Server error: ${message}`;
  }

  // Never report a failed result with an empty error string.
  return message || "Unknown error occurred";
}

/**
 * Extracts article content from `url`, delegating the actual work (and its
 * retries) to `extractWithRetry`.
 *
 * Failures are not propagated: any error thrown by `extractWithRetry` is
 * logged and converted into a result with `success: false`, empty
 * title/content/description, and a descriptive `error` message.
 *
 * @param url - Address of the page to extract.
 * @returns The extracted content, or a failure result describing the error.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  console.log(`Starting content extraction for: ${url}`);

  try {
    return await extractWithRetry(url);
  } catch (error) {
    console.error(`Content extraction failed after all retries for ${url}:`, error);

    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: describeExtractionError(error),
    };
  }
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user