import puppeteer, { type Browser } from "puppeteer";
import * as cheerio from "cheerio";
import type { CheerioAPI } from "cheerio";

/** Outcome of one article-extraction attempt. */
export interface ExtractedContent {
  title?: string;
  content: string;
  description?: string;
  /** False when extraction failed; `error` then carries the reason. */
  success: boolean;
  error?: string;
}

/** Settings for retryWithBackoff. Delays are passed to setTimeout (milliseconds). */
interface RetryOptions {
  /** Total number of attempts, including the first one. */
  maxRetries: number;
  baseDelay: number;
  maxDelay: number;
  backoffMultiplier: number;
}

const DEFAULT_RETRY_OPTIONS: RetryOptions = {
  maxRetries: 3,
  baseDelay: 1000,
  maxDelay: 10000,
  backoffMultiplier: 2,
};

// Substrings of an error message that mark a failure as worth retrying.
const RETRYABLE_ERROR_MARKERS: string[] = [
  'ERR_SOCKET_NOT_CONNECTED',
  'ERR_CONNECTION_REFUSED',
  'ERR_CONNECTION_RESET',
  'ERR_NETWORK_CHANGED',
  'ERR_INTERNET_DISCONNECTED',
  'ERR_NAME_NOT_RESOLVED',
  'ERR_TIMED_OUT',
  'Protocol error',
  'Navigation timeout',
  'net::',
  'Target closed',
  'Session closed',
];

// Singleton browser instance for reuse
let sharedBrowser: Browser | null = null;

/**
 * Resolves after `ms` milliseconds.
 * Helper function to replace page.waitForTimeout.
 */
async function waitForTimeout(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}

/**
 * Gives a freshly loaded page a chance to finish rendering client-side content.
 *
 * Runs four best-effort strategies in parallel, each capped by its own time
 * budget: wait for loading indicators to disappear, auto-scroll to trigger
 * lazy loading, wait for a content container, and click one "read more" style
 * control. Never throws: on failure it logs and falls back to a fixed wait.
 *
 * @param page Puppeteer page object (typed `any` in this file).
 */
async function handleDynamicContent(page: any): Promise<void> {
  try {
    console.log('Starting dynamic content handling...');

    // Wait for initial content
    await page.waitForSelector('body', { timeout: 5000 });

    // Strategy 1: wait for a common loading indicator to appear, then disappear.
    const waitForLoadersToClear = async (): Promise<void> => {
      const loadingSelectors = [
        '.loading', '.loader', '.spinner', '.skeleton',
        '[class*="loading"]', '[class*="skeleton"]',
        '.placeholder', '.shimmer',
      ];
      for (const selector of loadingSelectors) {
        try {
          await page.waitForSelector(selector, { timeout: 2000 });
          await page.waitForSelector(selector, { hidden: true, timeout: 10000 });
          break;
        } catch {
          // Indicator absent (or never cleared): try the next selector.
        }
      }
    };

    // Strategy 2: auto-scroll to trigger lazy loading, then return to the top.
    const autoScroll = async (): Promise<void> => {
      await page.evaluate(() => {
        return new Promise<void>((resolve) => {
          let totalHeight = 0;
          const distance = 500;
          const timer = setInterval(() => {
            const scrollHeight = document.body.scrollHeight;
            window.scrollBy(0, distance);
            totalHeight += distance;

            // Stop at the bottom of the page or after 5000px, whichever is first.
            if (totalHeight >= scrollHeight || totalHeight > 5000) {
              clearInterval(timer);
              window.scrollTo(0, 0); // Scroll back to top
              setTimeout(() => resolve(), 1000);
            }
          }, 200);
        });
      });
    };

    // Strategy 3: wait for the first content-specific container that shows up.
    const waitForContentContainer = async (): Promise<void> => {
      const contentSelectors = [
        'article', '.article-content', '.post-content', '.entry-content',
        'main', '[role="main"]', '.main-content',
      ];
      for (const selector of contentSelectors) {
        try {
          await page.waitForSelector(selector, { timeout: 3000 });
          break;
        } catch {
          // Not present: try the next selector.
        }
      }
    };

    // Strategy 4: click the first "Read More" / expansion control found.
    const expandCollapsedContent = async (): Promise<void> => {
      const expandButtons = [
        'button[class*="read-more"]', 'button[class*="expand"]',
        '.read-more', '.show-more', '.expand-content',
        'a[class*="read-more"]', 'a[class*="continue"]',
      ];
      for (const selector of expandButtons) {
        try {
          const button = await page.$(selector);
          if (button) {
            await button.click();
            await waitForTimeout(2000);
            break;
          }
        } catch {
          // Click failed (detached/hidden element): try the next control.
        }
      }
    };

    // Race a strategy against a deadline. The deadline timer is cleared once the
    // race settles so it does not stay pending after the strategy has finished.
    // A strategy that loses the race is NOT cancelled; it keeps running in the
    // background.
    const executeWithTimeout = async (
      strategy: () => Promise<void>,
      timeout: number
    ): Promise<void> => {
      let timer: ReturnType<typeof setTimeout> | undefined;
      const deadline = new Promise<void>((resolve) => {
        timer = setTimeout(resolve, timeout);
      });
      try {
        await Promise.race([strategy(), deadline]);
      } finally {
        clearTimeout(timer);
      }
    };

    // Execute all strategies in parallel with timeouts
    await Promise.allSettled([
      executeWithTimeout(waitForLoadersToClear, 3000),
      executeWithTimeout(autoScroll, 8000),
      executeWithTimeout(waitForContentContainer, 5000),
      executeWithTimeout(expandCollapsedContent, 3000),
    ]);

    // Final wait for any remaining dynamic content
    await waitForTimeout(2000);
  } catch (error) {
    console.log('Dynamic content handling failed, using basic timeout:', error);
    // If dynamic content handling fails, continue with basic timeout
    await waitForTimeout(3000);
  }

  console.log('Dynamic content handling completed.');
}

/**
 * Returns the shared headless browser, launching it when there is none or the
 * previous one has disconnected.
 *
 * NOTE(review): nothing serializes the launch, so two callers that both see a
 * missing browser before the first launch resolves will each launch one.
 * NOTE(review): the flags below disable web security and certificate checks.
 */
async function getBrowser(): Promise<Browser> {
  if (!sharedBrowser || !sharedBrowser.isConnected()) {
    sharedBrowser = await puppeteer.launch({
      headless: true,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-accelerated-2d-canvas",
        "--no-first-run",
        "--no-zygote",
        "--disable-gpu",
        "--disable-web-security",
        "--disable-features=VizDisplayCompositor",
        "--disable-background-timer-throttling",
        "--disable-backgrounding-occluded-windows",
        "--disable-renderer-backgrounding",
        "--disable-field-trial-config",
        "--disable-ipc-flooding-protection",
        "--enable-automation",
        "--force-device-scale-factor=1",
        "--ignore-certificate-errors",
        "--ignore-ssl-errors",
        "--ignore-certificate-errors-spki-list",
        "--allow-running-insecure-content",
        "--disable-extensions",
        "--no-default-browser-check",
        "--disable-default-apps",
        "--disable-sync",
        "--metrics-recording-only",
        "--no-pings",
        "--mute-audio"
      ],
    });
  }
  return sharedBrowser;
}

/** True when `error` is an Error whose message contains a retryable marker. */
function isRetryableError(error: unknown): error is Error {
  return (
    error instanceof Error &&
    RETRYABLE_ERROR_MARKERS.some((marker) => error.message.includes(marker))
  );
}

/**
 * Runs `operation`, retrying with exponential backoff on retryable errors.
 *
 * @param operation Async work to attempt.
 * @param options   Retry settings; `maxRetries` is the total attempt count.
 * @param attempt   1-based number of the first attempt (callers normally omit it).
 * @returns The value `operation` resolves with.
 * @throws The original error once attempts are exhausted or when the error is
 *         not retryable.
 */
async function retryWithBackoff<T>(
  operation: () => Promise<T>,
  options: RetryOptions = DEFAULT_RETRY_OPTIONS,
  attempt: number = 1
): Promise<T> {
  for (let current = attempt; ; current += 1) {
    try {
      return await operation();
    } catch (error) {
      if (current >= options.maxRetries || !isRetryableError(error)) {
        throw error;
      }

      const delay = Math.min(
        options.baseDelay * Math.pow(options.backoffMultiplier, current - 1),
        options.maxDelay
      );

      console.log(`Attempt ${current} failed, retrying in ${delay}ms:`, error.message);
      await waitForTimeout(delay);
    }
  }
}

/**
 * Closes the shared browser, if any, and forgets it.
 *
 * The reference is dropped before closing, so a browser that has already
 * disconnected or fails to close is never left behind as a stale singleton.
 */
export async function closeBrowser(): Promise<void> {
  const browser = sharedBrowser;
  if (!browser) {
    return;
  }
  sharedBrowser = null;
  if (browser.isConnected()) {
    await browser.close();
  }
}

// Fallback content extraction using
// fetch + cheerio (see extractWithFetchFallback below)

// Elements removed from the document before any text is read.
const UNWANTED_SELECTORS: string[] = [
  "script", "style", "noscript", "iframe", "embed", "object",
  "nav", "header", "footer", "aside", "form",
  ".advertisement", ".ads", ".ad", ".adsbygoogle", "[class*='ad-']", "[id*='ad-']",
  ".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb",
  ".social-share", ".share", ".social", ".sns",
  ".comments", ".comment", ".disqus",
  ".cookie-banner", ".cookie", ".gdpr",
  ".popup", ".modal", ".overlay", ".lightbox",
  ".related", ".recommended", ".more-stories",
  ".tags", ".categories", ".metadata",
  ".author-bio", ".author-info",
  ".newsletter", ".subscribe", ".signup",
  "[role='complementary']", "[role='banner']", "[role='contentinfo']",
  "[aria-label*='advertisement']", "[aria-label*='sidebar']"
];

// Candidate containers for the article body, shared by both extraction paths.
const CONTENT_SELECTORS: string[] = [
  // Schema.org and structured data
  '[itemtype*="Article"] [itemprop="articleBody"]',
  '[itemtype*="NewsArticle"] [itemprop="articleBody"]',
  '[itemtype*="BlogPosting"] [itemprop="articleBody"]',
  // High-priority semantic selectors
  'article[role="main"]', 'main article', '[role="main"] article', 'article',
  // Common CMS and platform selectors
  '.post-content', '.entry-content', '.article-content', '.content-area',
  '.article-body', '.post-body', '.entry-body', '.story-body',
  '.main-content', '.primary-content', '.page-content',
  '.news-content', '.blog-content', '.editorial-content',
  // WordPress specific
  '.wp-content', '.entry', '.post',
  // Medium, Substack, Ghost
  '.section-content', '.postArticle-content', '.post-full-content',
  '.markup', '.section--body', '.section-divider + .section-content',
  // Japanese sites specific
  '.honbun', '.main_text', '.article_body', '.news_body',
  '.entry_text', '.blog_text', '.content_text',
  '.kiji', '.news', '.article',
  // Generic semantic HTML5
  'main', '[role="main"]',
  // ID-based selectors
  '#content', '#main', '#article', '#post', '#entry',
  '#main-content', '#primary', '#content-area',
  // Class-based common patterns
  '.content', '.main', '.wrapper', '.container'
];

// Paragraphs containing any of these (case-insensitive) are treated as boilerplate.
const BOILERPLATE_MARKERS: string[] = [
  'cookie', 'privacy', 'terms of service', 'subscribe', 'newsletter'
];

// class/id fragments that mark an element as navigation rather than content.
const NAV_WORDS: string[] = ['メニュー', 'ナビ', 'カテゴリ', 'タグ', 'menu', 'navigation', 'nav', 'sidebar'];

// Limit content length to avoid token limits
const MAX_CONTENT_LENGTH = 50000;

const DEFAULT_USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

const USER_AGENTS: string[] = [
  DEFAULT_USER_AGENT,
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
];

// Error-message fragments that make extractArticleContent try the fetch fallback.
const FALLBACK_ERROR_MARKERS: string[] = [
  'TimeoutError',
  'Timed out after',
  'waiting for the WS endpoint URL',
  'Browser closed',
  'Target closed',
  'Session closed',
  'Protocol error',
  'Connection terminated',
  'spawn', // Process spawn errors
  'ECONNRESET',
  'ECONNREFUSED',
  'ENOTFOUND'
];

/**
 * Collapses every whitespace run (newlines included) to one space, removes
 * zero-width characters and trims the result.
 */
function cleanText(text: string): string {
  return text
    .replace(/\s+/g, ' ')
    .replace(/[\u200B-\u200D\uFEFF]/g, '')
    .trim();
}

/** Caps `content` at MAX_CONTENT_LENGTH characters, appending "..." when cut. */
function truncateContent(content: string): string {
  return content.length > MAX_CONTENT_LENGTH
    ? `${content.substring(0, MAX_CONTENT_LENGTH)}...`
    : content;
}

/**
 * Finds article text in raw JSON-LD blocks.
 *
 * Looks at each block's top-level object(s) and any `@graph` members, and
 * returns the first non-empty string `articleBody` (preferred) or `text`,
 * cleaned. Malformed blocks are skipped. Returns '' when nothing is found.
 */
function articleTextFromJsonLd(rawBlocks: string[]): string {
  for (const raw of rawBlocks) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch {
      continue; // Ignore JSON parsing errors
    }

    const roots: unknown[] = Array.isArray(parsed) ? parsed : [parsed];
    const nodes: unknown[] = roots.flatMap((root) => {
      const graph = root && typeof root === 'object' ? (root as Record<string, unknown>)['@graph'] : undefined;
      return Array.isArray(graph) ? [root, ...graph] : [root];
    });

    for (const node of nodes) {
      if (!node || typeof node !== 'object') {
        continue;
      }
      const { articleBody, text } = node as { articleBody?: unknown; text?: unknown };
      if (typeof articleBody === 'string' && articleBody.length > 0) {
        return cleanText(articleBody);
      }
      if (typeof text === 'string' && text.length > 0) {
        return cleanText(text);
      }
    }
  }
  return '';
}

/**
 * Extracts an article from `url` with a plain HTTP fetch and cheerio (no
 * JavaScript execution). Used when the Puppeteer path cannot run.
 *
 * Never throws: failures are reported through `success: false` and `error`.
 */
async function extractWithFetchFallback(url: string): Promise<ExtractedContent> {
  console.log(`Using fetch fallback for: ${url}`);

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': DEFAULT_USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'no-cache'
      },
      signal: AbortSignal.timeout(30000) // 30 second timeout
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Capture JSON-LD first: the cleanup below removes every <script>, which
    // would otherwise make the structured-data fallback unreachable.
    const jsonLdBlocks = $('script[type="application/ld+json"]')
      .toArray()
      .map((node) => $(node).html() || '');

    // Remove unwanted elements first
    UNWANTED_SELECTORS.forEach((selector) => {
      $(selector).remove();
    });

    const firstNonEmpty = (values: Array<string | undefined>): string =>
      values.find((value) => value && value.length > 0) || '';

    // Extract title
    const title = firstNonEmpty([
      $('meta[property="og:title"]').attr('content'),
      $('meta[name="twitter:title"]').attr('content'),
      $('h1').first().text().trim(),
      $('.article-title, .post-title, .entry-title').first().text().trim(),
      $('title').text().trim(),
      $('[itemprop="headline"]').first().text().trim()
    ]);

    // Extract description
    const description = firstNonEmpty([
      $('meta[property="og:description"]').attr('content'),
      $('meta[name="description"]').attr('content'),
      $('meta[name="twitter:description"]').attr('content'),
      $('[itemprop="description"]').first().text().trim()
    ]);

    // Heuristic content-quality score; higher means more article-like.
    const calculateContentScore = (element: cheerio.Cheerio<any>): number => {
      const text = element.text() || '';
      if (text.length < 100) return 0;

      // Base score from text length (diminishing returns)
      let score = Math.min(text.length / 100, 50);

      // Paragraph density
      const paragraphs = element.find('p');
      const paragraphChars = paragraphs.toArray()
        .reduce((sum, p) => sum + ($(p).text().length || 0), 0);
      const avgParagraphLength = paragraphs.length > 0 ? paragraphChars / paragraphs.length : 0;
      if (avgParagraphLength > 100) score += 20;
      if (paragraphs.length > 3) score += 10;

      // Link density penalty
      const linkChars = element.find('a').toArray()
        .reduce((sum, link) => sum + ($(link).text().length || 0), 0);
      const linkDensity = text.length > 0 ? linkChars / text.length : 0;
      if (linkDensity < 0.2) score += 15;
      else if (linkDensity < 0.4) score += 5;
      else score -= 10;

      // Bonus for article-like structure
      if (element.prop('tagName') === 'ARTICLE') score += 25;
      if (element.attr('role') === 'main') score += 20;
      if (element.find('h1, h2, h3').length > 0) score += 10;

      // Bonus for semantic elements
      if (element.find('p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol').length > 5) score += 15;

      // Penalty for navigation-like content
      const className = (element.attr('class') || '').toLowerCase();
      const id = (element.attr('id') || '').toLowerCase();
      if (NAV_WORDS.some((word) => className.includes(word) || id.includes(word))) {
        score -= 20;
      }

      return Math.max(score, 0);
    };

    // Collect and score all content candidates
    interface ContentCandidate {
      element: cheerio.Cheerio<any>;
      score: number;
      content: string;
      selector: string;
    }

    const candidates: ContentCandidate[] = [];
    for (const selector of CONTENT_SELECTORS) {
      try {
        $(selector).each((index, node) => {
          const $element = $(node);
          const text = $element.text() || '';
          if (text.length > 200) {
            candidates.push({
              element: $element,
              score: calculateContentScore($element),
              content: cleanText(text),
              selector: `${selector}[${index}]`
            });
          }
        });
      } catch {
        continue; // Selector not supported: skip it
      }
    }

    // Sort candidates by score (highest first)
    candidates.sort((a, b) => b.score - a.score);

    console.log(`Found ${candidates.length} content candidates`);
    if (candidates.length > 0) {
      console.log(`Best candidate score: ${candidates[0]!.score}, selector: ${candidates[0]!.selector}`);
    }

    // Get the best content
    let content = "";
    if (candidates.length > 0) {
      content = candidates[0]!.content;

      // If the best candidate is still short, try combining top candidates
      if (content.length < 500 && candidates.length > 1) {
        const combinedContent = candidates
          .slice(0, 3)
          .filter((candidate) => candidate.score > 10)
          .map((candidate) => candidate.content)
          .join('\n\n');
        if (combinedContent.length > content.length) {
          content = cleanText(combinedContent);
        }
      }
    }

    // Fallback strategies if still no good content
    if (!content || content.length < 200) {
      console.log('Using paragraph aggregation fallback...');
      const paragraphs = $('p').toArray()
        .map((p) => $(p).text().trim())
        .filter((p) => p.length > 50)
        .filter((p) => {
          const lowerP = p.toLowerCase();
          return !BOILERPLATE_MARKERS.some((marker) => lowerP.includes(marker));
        });

      if (paragraphs.length > 0) {
        content = cleanText(paragraphs.join('\n\n'));
      }
    }

    // Final fallback: structured data
    if (!content || content.length < 200) {
      console.log('Trying structured data fallback...');
      const structured = articleTextFromJsonLd(jsonLdBlocks);
      if (structured) {
        content = structured;
      }
    }

    content = truncateContent(content);

    console.log(`Fetch fallback extracted content: ${content.length} characters`);

    if (!content || content.length < 100) {
      return {
        title: title || '',
        content: '',
        description: description || '',
        success: false,
        error: `Insufficient content extracted via fetch fallback (${content?.length || 0} characters)`,
      };
    }

    return {
      title: title || '',
      content,
      description: description || '',
      success: true,
    };
  } catch (error) {
    console.error(`Fetch fallback failed:`, error);
    return {
      title: '',
      content: '',
      description: '',
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error in fetch fallback',
    };
  }
}

/**
 * Extracts an article from `url` in headless Chrome, retrying retryable
 * failures through retryWithBackoff.
 *
 * Resolves with `success: false` when the page loaded but too little text was
 * found. Navigation, HTTP (status >= 400) and browser errors are rethrown so
 * the retry logic and the caller can handle them.
 */
async function extractWithRetry(url: string): Promise<ExtractedContent> {
  return retryWithBackoff(async () => {
    let page: Awaited<ReturnType<Browser["newPage"]>> | null = null;

    try {
      const browser = await getBrowser();
      page = await browser.newPage();

      // Randomize user agent to avoid detection
      const userAgent =
        USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)] ?? DEFAULT_USER_AGENT;
      await page.setUserAgent(userAgent);
      await page.setViewport({ width: 1280, height: 720 });

      // Set longer timeout for problematic sites
      page.setDefaultNavigationTimeout(60000);
      page.setDefaultTimeout(60000);

      // Block unnecessary resources to speed up loading
      await page.setRequestInterception(true);
      page.on('request', (req) => {
        const resourceType = req.resourceType();
        const isBlocked =
          resourceType === 'image' || resourceType === 'media' || resourceType === 'font';
        const resolution = isBlocked ? req.abort() : req.continue();
        // abort()/continue() return promises; they can reject when the request
        // is already gone (page closing, navigation), so a handler is attached
        // to avoid an unhandled rejection. The decision itself is best-effort.
        resolution.catch(() => undefined);
      });

      // Add extra headers to appear more like a real browser
      await page.setExtraHTTPHeaders({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
      });

      // Navigate with multiple wait strategies
      let response;
      try {
        response = await page.goto(url, {
          waitUntil: ["domcontentloaded", "networkidle0"],
          timeout: 60000,
        });
      } catch (networkError) {
        // Fallback to more basic wait strategy
        const errorMessage = networkError instanceof Error ? networkError.message : 'Unknown error';
        console.log('Network idle failed, trying domcontentloaded only:', errorMessage);
        response = await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: 60000,
        });
      }

      if (!response) {
        throw new Error('No response received from server');
      }

      const status = response.status();
      if (status && status >= 400) {
        throw new Error(`HTTP ${status}: ${response.statusText() || 'Unknown error'}`);
      }

      // Enhanced dynamic content handling
      console.log('Handling dynamic content...');
      await handleDynamicContent(page);

      // Extract content using advanced multi-strategy approach
      console.log('Extracting content using multi-strategy approach...');
      const extractedData = await page.evaluate(
        // Runs inside the page: only the serialized `config` argument and
        // browser globals are available here, not this module's helpers.
        (config: {
          unwantedSelectors: string[];
          contentSelectors: string[];
          boilerplateMarkers: string[];
          navWords: string[];
        }) => {
          interface ContentCandidate {
            element: Element;
            score: number;
            content: string;
            selector: string;
          }

          // Capture JSON-LD before the cleanup below deletes every <script>.
          const jsonLdBlocks = Array.from(
            document.querySelectorAll('script[type="application/ld+json"]')
          ).map((node) => node.textContent || '');

          // Remove unwanted elements first
          config.unwantedSelectors.forEach((selector) => {
            try {
              document.querySelectorAll(selector).forEach((el) => el.remove());
            } catch {
              // Ignore invalid selectors
            }
          });

          const firstNonEmpty = (values: Array<string | null | undefined>): string =>
            values.find((value) => value && value.length > 0) || '';

          // Extract title
          const title = firstNonEmpty([
            document.querySelector('meta[property="og:title"]')?.getAttribute('content'),
            document.querySelector('meta[name="twitter:title"]')?.getAttribute('content'),
            document.querySelector('h1')?.textContent?.trim(),
            document.querySelector('.article-title, .post-title, .entry-title')?.textContent?.trim(),
            document.querySelector('title')?.textContent?.trim(),
            document.querySelector('[itemprop="headline"]')?.textContent?.trim()
          ]);

          // Extract description
          const description = firstNonEmpty([
            document.querySelector('meta[property="og:description"]')?.getAttribute('content'),
            document.querySelector('meta[name="description"]')?.getAttribute('content'),
            document.querySelector('meta[name="twitter:description"]')?.getAttribute('content'),
            document.querySelector('[itemprop="description"]')?.textContent?.trim()
          ]);

          // Heuristic content-quality score; higher means more article-like.
          const calculateContentScore = (element: Element): number => {
            if (!element) return 0;

            const text = element.textContent || '';
            if (text.length < 100) return 0;

            // Base score from text length (diminishing returns)
            let score = Math.min(text.length / 100, 50);

            // Paragraph density
            const paragraphs = element.querySelectorAll('p');
            const paragraphChars = Array.from(paragraphs)
              .reduce((sum, p) => sum + (p.textContent?.length || 0), 0);
            const avgParagraphLength = paragraphs.length > 0 ? paragraphChars / paragraphs.length : 0;
            if (avgParagraphLength > 100) score += 20;
            if (paragraphs.length > 3) score += 10;

            // Link density penalty (articles shouldn't be mostly links)
            const linkChars = Array.from(element.querySelectorAll('a'))
              .reduce((sum, link) => sum + (link.textContent?.length || 0), 0);
            const linkDensity = text.length > 0 ? linkChars / text.length : 0;
            if (linkDensity < 0.2) score += 15;
            else if (linkDensity < 0.4) score += 5;
            else score -= 10;

            // Bonus for article-like structure
            if (element.tagName === 'ARTICLE') score += 25;
            if (element.getAttribute('role') === 'main') score += 20;
            if (element.querySelector('h1, h2, h3')) score += 10;

            // Bonus for semantic elements
            const semanticElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol');
            if (semanticElements.length > 5) score += 15;

            // Penalty for too many images without text
            const images = element.querySelectorAll('img');
            if (images.length > text.length / 500) score -= 5;

            // Penalty for navigation-like content. Read the attribute instead of
            // `className`: on SVG elements `className` is not a string and
            // calling toLowerCase() on it throws.
            const className = (element.getAttribute('class') || '').toLowerCase();
            const id = (element.id || '').toLowerCase();
            if (config.navWords.some((word) => className.includes(word) || id.includes(word))) {
              score -= 20;
            }

            return Math.max(score, 0);
          };

          // Collapse whitespace, drop zero-width characters, trim.
          const cleanText = (text: string): string =>
            text
              .replace(/\s+/g, ' ')
              .replace(/[\u200B-\u200D\uFEFF]/g, '')
              .trim();

          // Collect and score all content candidates
          const candidates: ContentCandidate[] = [];
          for (const selector of config.contentSelectors) {
            try {
              document.querySelectorAll(selector).forEach((element, index) => {
                const text = element.textContent || '';
                if (text.length > 200) { // Minimum content threshold
                  candidates.push({
                    element,
                    score: calculateContentScore(element),
                    content: cleanText(text),
                    selector: `${selector}[${index}]`
                  });
                }
              });
            } catch {
              // Skip invalid selectors
              continue;
            }
          }

          // Sort candidates by score (highest first)
          candidates.sort((a, b) => b.score - a.score);

          console.log(`Found ${candidates.length} content candidates`);
          if (candidates.length > 0) {
            console.log(`Best candidate score: ${candidates[0]!.score}, selector: ${candidates[0]!.selector}`);
          }

          // Get the best content
          let content = "";
          if (candidates.length > 0) {
            content = candidates[0]!.content;

            // If the best candidate is still short, try combining top candidates
            if (content.length < 500 && candidates.length > 1) {
              const combinedContent = candidates
                .slice(0, 3)
                .filter((candidate) => candidate.score > 10)
                .map((candidate) => candidate.content)
                .join('\n\n');
              if (combinedContent.length > content.length) {
                content = cleanText(combinedContent);
              }
            }
          }

          // Fallback strategies if still no good content
          if (!content || content.length < 200) {
            // Try paragraph aggregation
            const paragraphs = Array.from(document.querySelectorAll('p'))
              .map((p) => p.textContent?.trim() || '')
              .filter((p) => p.length > 50)
              .filter((p) => {
                // Filter out likely navigation/boilerplate paragraphs
                const lowerP = p.toLowerCase();
                return !config.boilerplateMarkers.some((marker) => lowerP.includes(marker));
              });

            if (paragraphs.length > 0) {
              content = cleanText(paragraphs.join('\n\n'));
            }
          }

          console.log(`Final content length: ${content.length} characters`);

          return { title, content, description, jsonLdBlocks };
        },
        {
          unwantedSelectors: UNWANTED_SELECTORS,
          // Fallbacks
          contentSelectors: [...CONTENT_SELECTORS, 'body'],
          boilerplateMarkers: BOILERPLATE_MARKERS,
          navWords: NAV_WORDS,
        }
      );

      let content = extractedData.content;

      // Final fallback: structured data (parsed here from the captured blocks)
      if (!content || content.length < 200) {
        const structured = articleTextFromJsonLd(extractedData.jsonLdBlocks);
        if (structured) {
          content = structured;
        }
      }

      // Validate extracted content with more lenient threshold
      if (!content || content.length < 100) {
        // Try one more extraction attempt with relaxed criteria
        const fallbackData = await page.evaluate(() => {
          // Last resort: extract all text from body, excluding common noise
          const body = document.body;
          if (!body) {
            return '';
          }

          // Clone body to avoid modifying original
          const bodyClone = body.cloneNode(true) as Element;

          // Remove noise elements from clone
          const noiseSelectors = [
            'script', 'style', 'nav', 'header', 'footer', 'aside',
            '.ad', '.ads', '.advertisement', '[class*="ad-"]',
            '.menu', '.navigation', '.sidebar', '.social',
            '.cookie', '.popup', '.modal'
          ];
          noiseSelectors.forEach((selector) => {
            bodyClone.querySelectorAll(selector).forEach((el) => el.remove());
          });

          const text = bodyClone.textContent || '';
          return text.replace(/\s+/g, ' ').trim();
        });

        if (fallbackData && fallbackData.length > 200) {
          content = fallbackData;
        } else {
          return {
            title: extractedData.title || '',
            content: content || "",
            description: extractedData.description || '',
            success: false,
            error: `Insufficient content extracted (${content?.length || 0} characters)`,
          };
        }
      }

      content = truncateContent(content);

      console.log(`Successfully extracted content: ${content.length} characters`);

      return {
        title: extractedData.title,
        content,
        description: extractedData.description,
        success: true,
      };
    } catch (error) {
      console.error(`Content extraction attempt failed:`, error);
      throw error; // Let retry logic handle this
    } finally {
      if (page) {
        try {
          await page.close();
        } catch (closeError) {
          console.warn('Failed to close page:', closeError);
        }
      }
    }
  });
}

/** Maps an extraction failure to the message reported in `ExtractedContent.error`. */
function describeExtractionError(error: unknown): string {
  if (!(error instanceof Error)) {
    return "Unknown error occurred";
  }

  // First matching marker wins, so the order is significant.
  const { message } = error;
  const knownFailures: Array<[marker: string, text: string]> = [
    ['ERR_SOCKET_NOT_CONNECTED', "Network connection failed - server may be unreachable"],
    ['ERR_CONNECTION_REFUSED', "Connection refused by server"],
    ['ERR_NAME_NOT_RESOLVED', "DNS resolution failed - domain may not exist"],
    ['ERR_TIMED_OUT', "Request timed out - server too slow"],
    ['HTTP 4', `Client error: ${message}`],
    ['HTTP 5', `Server error: ${message}`],
    ['TimeoutError', "Puppeteer browser launch timeout - both Puppeteer and fetch fallback failed"],
  ];
  const match = knownFailures.find(([marker]) => message.includes(marker));
  return match ? match[1] : message;
}

/**
 * Extracts the readable article content at `url`.
 *
 * Tries the Puppeteer path first; when that fails with a browser/launch style
 * error (see FALLBACK_ERROR_MARKERS) it tries a plain fetch + cheerio
 * extraction. Never throws: failures come back as `success: false`.
 *
 * NOTE(review): matching is done on `error.message` only; confirm whether
 * errors whose `name` is 'TimeoutError' should also trigger the fallback.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  console.log(`Starting content extraction for: ${url}`);

  try {
    return await extractWithRetry(url);
  } catch (error) {
    console.error(`Content extraction failed after all retries for ${url}:`, error);

    // Check if this is a Puppeteer launch/browser failure that should trigger fallback
    const shouldUseFallback =
      error instanceof Error &&
      FALLBACK_ERROR_MARKERS.some((marker) => error.message.includes(marker));

    if (shouldUseFallback) {
      console.log(`Puppeteer failed, trying fetch fallback for ${url}`);
      try {
        const fallbackResult = await extractWithFetchFallback(url);
        if (fallbackResult.success) {
          console.log(`Fetch fallback succeeded for ${url}`);
          return fallbackResult;
        }
        console.log(`Fetch fallback also failed for ${url}:`, fallbackResult.error);
      } catch (fallbackError) {
        console.error(`Fetch fallback threw error for ${url}:`, fallbackError);
      }
    }

    // Provide more specific error messages
    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: describeExtractionError(error),
    };
  }
}

/**
 * Returns the best available content/description for a feed item.
 *
 * When the item already has more than 500 characters of content (or, if there
 * is no content, of description) the originals are returned untouched.
 * Otherwise the article is fetched from `originalLink`; on success the
 * extracted content is used, and on failure the originals are returned.
 *
 * @param _originalTitle      Unused.
 * @param originalLink        URL of the article.
 * @param originalContent     Content already available for the item, if any.
 * @param originalDescription Description already available for the item, if any.
 */
export async function enhanceArticleContent(
  _originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  const originals = {
    content: originalContent,
    description: originalDescription,
  };

  // If we already have substantial content, use it
  const existingContent = originalContent || originalDescription || "";
  if (existingContent.length > 500) {
    return originals;
  }

  // Try to extract content from the URL
  const extracted = await extractArticleContent(originalLink);
  if (extracted.success && extracted.content) {
    return {
      content: extracted.content,
      description: extracted.description || originalDescription,
    };
  }

  // Return original content if extraction failed
  return originals;
}