From 886dc944299ea82fd410efb400cc6dabe0aae495 Mon Sep 17 00:00:00 2001 From: Satsuki Akiba Date: Thu, 12 Jun 2025 12:20:25 +0900 Subject: [PATCH] Update --- services/content-extractor.ts | 253 ++++++++++++++++++++++++++-------- 1 file changed, 199 insertions(+), 54 deletions(-) diff --git a/services/content-extractor.ts b/services/content-extractor.ts index 9c77d01..b01fcaa 100644 --- a/services/content-extractor.ts +++ b/services/content-extractor.ts @@ -8,6 +8,20 @@ export interface ExtractedContent { error?: string; } +interface RetryOptions { + maxRetries: number; + baseDelay: number; + maxDelay: number; + backoffMultiplier: number; +} + +const DEFAULT_RETRY_OPTIONS: RetryOptions = { + maxRetries: 3, + baseDelay: 1000, + maxDelay: 10000, + backoffMultiplier: 2 +}; + // Singleton browser instance for reuse let sharedBrowser: Browser | null = null; @@ -146,12 +160,74 @@ async function getBrowser(): Promise { "--disable-gpu", "--disable-web-security", "--disable-features=VizDisplayCompositor", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-renderer-backgrounding", + "--disable-field-trial-config", + "--disable-ipc-flooding-protection", + "--enable-automation", + "--force-device-scale-factor=1", + "--ignore-certificate-errors", + "--ignore-ssl-errors", + "--ignore-certificate-errors-spki-list", + "--allow-running-insecure-content", + "--disable-extensions", + "--no-default-browser-check", + "--disable-default-apps", + "--disable-sync", + "--metrics-recording-only", + "--no-pings", + "--mute-audio" ], }); } return sharedBrowser; } +// Helper function for exponential backoff retry +async function retryWithBackoff( + operation: () => Promise, + options: RetryOptions = DEFAULT_RETRY_OPTIONS, + attempt: number = 1 +): Promise { + try { + return await operation(); + } catch (error) { + if (attempt >= options.maxRetries) { + throw error; + } + + const isRetryableError = error instanceof Error && ( + error.message.includes('ERR_SOCKET_NOT_CONNECTED') || + error.message.includes('ERR_CONNECTION_REFUSED') || + error.message.includes('ERR_CONNECTION_RESET') || + error.message.includes('ERR_NETWORK_CHANGED') || + error.message.includes('ERR_INTERNET_DISCONNECTED') || + error.message.includes('ERR_NAME_NOT_RESOLVED') || + error.message.includes('ERR_TIMED_OUT') || + error.message.includes('Protocol error') || + error.message.includes('Navigation timeout') || + error.message.includes('net::') || + error.message.includes('Target closed') || + error.message.includes('Session closed') + ); + + if (!isRetryableError) { + throw error; + } + + const delay = Math.min( + options.baseDelay * Math.pow(options.backoffMultiplier, attempt - 1), + options.maxDelay + ); + + console.log(`Attempt ${attempt} failed, retrying in ${delay}ms:`, error.message); + await waitForTimeout(delay); + + return retryWithBackoff(operation, options, attempt + 1); + } +} + export async function closeBrowser(): Promise { if (sharedBrowser && sharedBrowser.isConnected()) { await sharedBrowser.close(); @@ -159,49 +235,79 @@ export async function closeBrowser(): Promise { } } -export async function extractArticleContent( - url: string, -): Promise { - console.log(`Starting content extraction for: ${url}`); - let page = null; - try { - const browser = await getBrowser(); - page = await browser.newPage(); +async function extractWithRetry(url: string): Promise { + const userAgents = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0" + ]; - // Set user agent and viewport - await page.setUserAgent( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - ); - await page.setViewport({ width: 1280, height: 720 }); + return retryWithBackoff(async () => { + let page = null; + try { + const browser = await getBrowser(); + page = await browser.newPage(); - // Set navigation timeout and disable images for faster loading - page.setDefaultNavigationTimeout(45000); - page.setDefaultTimeout(45000); - - // Block unnecessary resources to speed up loading - await page.setRequestInterception(true); - page.on('request', (req) => { - const resourceType = req.resourceType(); - if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') { - req.abort(); - } else { - req.continue(); + // Randomize user agent to avoid detection + const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)] || userAgents[0]; + await page.setUserAgent(userAgent!); + await page.setViewport({ width: 1280, height: 720 }); + + // Set longer timeout for problematic sites + page.setDefaultNavigationTimeout(60000); + page.setDefaultTimeout(60000); + + // Block unnecessary resources to speed up loading + await page.setRequestInterception(true); + page.on('request', (req) => { + const resourceType = req.resourceType(); + if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') { + req.abort(); + } else { + req.continue(); + } + }); + + // Add extra headers to appear more like a real browser + await page.setExtraHTTPHeaders({ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1' + }); + + // Navigate with multiple wait strategies + let response; + try { + response = await page.goto(url, { + waitUntil: ["domcontentloaded", "networkidle0"], + timeout: 60000, + }); + } catch (networkError) { + // Fallback to more basic wait strategy + const errorMessage = networkError instanceof Error ? networkError.message : 'Unknown error'; + console.log('Network idle failed, trying domcontentloaded only:', errorMessage); + response = await page.goto(url, { + waitUntil: "domcontentloaded", + timeout: 60000, + }); } - }); - // Navigate to the page with better waiting strategy - const response = await page.goto(url, { - waitUntil: "domcontentloaded", - timeout: 45000, - }); + if (!response) { + throw new Error('No response received from server'); + } - if (!response || !response.ok()) { - throw new Error(`HTTP ${response?.status()}: Failed to load page`); - } + const status = response?.status(); + if (status && status >= 400) { + throw new Error(`HTTP ${status}: ${response?.statusText() || 'Unknown error'}`); + } - // Enhanced dynamic content handling - console.log('Handling dynamic content...'); - await handleDynamicContent(page); + // Enhanced dynamic content handling + console.log('Handling dynamic content...'); + await handleDynamicContent(page); // Extract content using advanced multi-strategy approach console.log('Extracting content using multi-strategy approach...'); @@ -420,13 +526,13 @@ export async function extractArticleContent( console.log(`Found ${candidates.length} content candidates`); if (candidates.length > 0) { - console.log(`Best candidate score: ${candidates[0].score}, selector: ${candidates[0].selector}`); + console.log(`Best candidate score: ${candidates[0]!.score}, selector: ${candidates[0]!.selector}`); } // Get the best content let content = ""; if (candidates.length > 0) { - content = candidates[0].content; + content = candidates[0]!.content; // If the best candidate is still short, try combining top candidates if (content.length < 500 && candidates.length > 1) { @@ -513,9 +619,9 @@ export async function extractArticleContent( extractedData.content = fallbackData; } else { return { - title: extractedData.title, + title: extractedData.title || '', content: extractedData.content || "", - description: extractedData.description, + description: extractedData.description || '', success: false, error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`, }; @@ -529,26 +635,65 @@ export async function extractArticleContent( content = content.substring(0, maxLength) + "..."; } - console.log(`Successfully extracted content: ${content.length} characters`); - return { - title: extractedData.title, - content, - description: extractedData.description, - success: true, - }; + console.log(`Successfully extracted content: ${content.length} characters`); + return { + title: extractedData.title, + content, + description: extractedData.description, + success: true, + }; + } catch (error) { + console.error(`Content extraction attempt failed:`, error); + throw error; // Let retry logic handle this + } finally { + if (page) { + try { + await page.close(); + } catch (closeError) { + console.warn('Failed to close page:', closeError); + } + } + } + }); +} + +export async function extractArticleContent( + url: string, +): Promise { + console.log(`Starting content extraction for: ${url}`); + + try { + return await extractWithRetry(url); } catch (error) { - console.error(`Content extraction failed for ${url}:`, error); + console.error(`Content extraction failed after all retries for ${url}:`, error); + + // Provide more specific error messages + let errorMessage = "Unknown error occurred"; + if (error instanceof Error) { + if (error.message.includes('ERR_SOCKET_NOT_CONNECTED')) { + errorMessage = "Network connection failed - server may be unreachable"; + } else if (error.message.includes('ERR_CONNECTION_REFUSED')) { + errorMessage = "Connection refused by server"; + } else if (error.message.includes('ERR_NAME_NOT_RESOLVED')) { + errorMessage = "DNS resolution failed - domain may not exist"; + } else if (error.message.includes('ERR_TIMED_OUT')) { + errorMessage = "Request timed out - server too slow"; + } else if (error.message.includes('HTTP 4')) { + errorMessage = `Client error: ${error.message}`; + } else if (error.message.includes('HTTP 5')) { + errorMessage = `Server error: ${error.message}`; + } else { + errorMessage = error.message; + } + } + return { title: "", content: "", description: "", success: false, - error: error instanceof Error ? error.message : "Unknown error occurred", + error: errorMessage, }; - } finally { - if (page) { - await page.close(); - } } }