import puppeteer, { type Browser, type Page } from "puppeteer";

/** Result of a single article extraction attempt. */
export interface ExtractedContent {
  title?: string;
  content: string;
  description?: string;
  /** `false` when the page could not be loaded or too little text was found. */
  success: boolean;
  /** Failure reason; only populated on the failure paths. */
  error?: string;
}

// Singleton browser instance for reuse
let sharedBrowser: Browser | null = null;

// Launch currently in flight, shared so that concurrent callers do not each
// start (and leak) their own browser process.
let pendingLaunch: Promise<Browser> | null = null;

/** Replacement for `page.waitForTimeout`: resolves after `ms` milliseconds. */
async function waitForTimeout(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Best-effort wait for client-rendered content to settle.
 *
 * Runs four independent strategies in parallel, each capped by its own time
 * budget, then pauses briefly. Never throws: on failure it logs and falls back
 * to a plain fixed delay.
 *
 * @param page - The already-navigated page to settle.
 */
async function handleDynamicContent(page: Page): Promise<void> {
  try {
    console.log('Starting dynamic content handling...');

    // Wait for initial content
    await page.waitForSelector('body', { timeout: 5000 });

    // Strategy 1: wait for a loading indicator to show up and then go away.
    // The selectors are combined into one list so that every indicator is
    // considered inside the time budget (probing them one at a time would
    // exhaust the budget on the first one or two).
    const waitForLoadingIndicators = async (): Promise<void> => {
      const loadingSelector = [
        '.loading',
        '.loader',
        '.spinner',
        '.skeleton',
        '[class*="loading"]',
        '[class*="skeleton"]',
        '.placeholder',
        '.shimmer',
      ].join(', ');
      try {
        await page.waitForSelector(loadingSelector, { timeout: 2000 });
        await page.waitForSelector(loadingSelector, { hidden: true, timeout: 10000 });
      } catch {
        // No indicator appeared, or it never went away: carry on regardless.
      }
    };

    // Strategy 2: auto-scroll to trigger lazy loading
    const autoScroll = async (): Promise<void> => {
      await page.evaluate(() => {
        return new Promise<void>((resolve) => {
          let totalHeight = 0;
          const distance = 500;
          const timer = setInterval(() => {
            const scrollHeight = document.body.scrollHeight;
            window.scrollBy(0, distance);
            totalHeight += distance;
            // Stop at the bottom of the page or after 5000px, whichever is first.
            if (totalHeight >= scrollHeight || totalHeight > 5000) {
              clearInterval(timer);
              window.scrollTo(0, 0); // Scroll back to top
              setTimeout(() => resolve(), 1000);
            }
          }, 200);
        });
      });
    };

    // Strategy 3: wait for any content-specific container to be present.
    // Combined into one selector list for the same reason as strategy 1.
    const waitForContentContainer = async (): Promise<void> => {
      const contentSelector = [
        'article',
        '.article-content',
        '.post-content',
        '.entry-content',
        'main',
        '[role="main"]',
        '.main-content',
      ].join(', ');
      try {
        await page.waitForSelector(contentSelector, { timeout: 3000 });
      } catch {
        // None of the known containers exist on this page.
      }
    };

    // Strategy 4: click the first "Read More" / expansion control found
    const expandCollapsedContent = async (): Promise<void> => {
      const expandButtons = [
        'button[class*="read-more"]',
        'button[class*="expand"]',
        '.read-more',
        '.show-more',
        '.expand-content',
        'a[class*="read-more"]',
        'a[class*="continue"]',
      ];
      for (const selector of expandButtons) {
        try {
          const button = await page.$(selector);
          if (button) {
            await button.click();
            await waitForTimeout(2000);
            break;
          }
        } catch {
          // Continue to next button
        }
      }
    };

    // Resolve when the strategy finishes or the budget elapses, whichever is
    // first. The timer is cleared so it does not linger after an early finish.
    const executeWithTimeout = async (
      strategy: () => Promise<void>,
      timeout: number,
    ): Promise<void> => {
      let timer: ReturnType<typeof setTimeout> | undefined;
      const deadline = new Promise<void>((resolve) => {
        timer = setTimeout(resolve, timeout);
      });
      try {
        await Promise.race([strategy(), deadline]);
      } finally {
        clearTimeout(timer);
      }
    };

    // Execute all strategies in parallel with timeouts
    await Promise.allSettled([
      executeWithTimeout(waitForLoadingIndicators, 3000),
      executeWithTimeout(autoScroll, 8000),
      executeWithTimeout(waitForContentContainer, 5000),
      executeWithTimeout(expandCollapsedContent, 3000),
    ]);

    // Final wait for any remaining dynamic content
    await waitForTimeout(2000);
  } catch (error) {
    console.log('Dynamic content handling failed, using basic timeout:', error);
    // If dynamic content handling fails, continue with basic timeout
    await waitForTimeout(3000);
  }

  console.log('Dynamic content handling completed.');
}

/**
 * Returns the shared browser, launching it when there is none or the previous
 * one has disconnected. Concurrent callers share a single in-flight launch.
 */
async function getBrowser(): Promise<Browser> {
  if (sharedBrowser?.isConnected()) {
    return sharedBrowser;
  }

  let launch = pendingLaunch;
  if (!launch) {
    // NOTE(review): "--disable-web-security" turns off the same-origin policy
    // for every scraped page; confirm it is really required.
    const started = puppeteer.launch({
      headless: true,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-accelerated-2d-canvas",
        "--no-first-run",
        "--no-zygote",
        "--disable-gpu",
        "--disable-web-security",
        "--disable-features=VizDisplayCompositor",
      ],
    });
    pendingLaunch = started;
    // Forget the in-flight launch once it settles (success or failure) so a
    // failed launch can be retried by the next caller.
    const clearPending = (): void => {
      if (pendingLaunch === started) {
        pendingLaunch = null;
      }
    };
    started.then(clearPending, clearPending);
    launch = started;
  }

  const browser = await launch;
  sharedBrowser = browser;
  return browser;
}

/**
 * Closes the shared browser, if any. The singleton reference is dropped first
 * so that a failing or already-disconnected browser is never reused.
 */
export async function closeBrowser(): Promise<void> {
  const browser = sharedBrowser;
  sharedBrowser = null;
  if (browser?.isConnected()) {
    await browser.close();
  }
}

/**
 * Loads `url` in a headless browser page and extracts the main article text,
 * title and description using a scored multi-selector heuristic with several
 * fallbacks.
 *
 * Does not throw for load or extraction failures: those are reported through
 * `success: false` and `error` on the returned object.
 *
 * @param url - Address of the article to load.
 * @returns The extracted fields; `content` is truncated to 50000 characters
 *   (plus a trailing "...") when longer.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  console.log(`Starting content extraction for: ${url}`);

  let page: Page | null = null;

  try {
    const browser = await getBrowser();
    page = await browser.newPage();

    // Set user agent and viewport
    await page.setUserAgent(
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    );
    await page.setViewport({ width: 1280, height: 720 });

    // Set navigation and general timeouts
    page.setDefaultNavigationTimeout(45000);
    page.setDefaultTimeout(45000);

    // Block unnecessary resources to speed up loading
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      const resourceType = req.resourceType();
      const blocked =
        resourceType === 'image' ||
        resourceType === 'media' ||
        resourceType === 'font';
      // abort()/continue() return promises; swallow their rejection so a
      // request that can no longer be handled does not surface as an
      // unhandled rejection.
      void (blocked ? req.abort() : req.continue()).catch(() => undefined);
    });

    // Navigate to the page with better waiting strategy
    const response = await page.goto(url, {
      waitUntil: "domcontentloaded",
      timeout: 45000,
    });

    if (!response) {
      throw new Error("No response received: Failed to load page");
    }
    if (!response.ok()) {
      throw new Error(`HTTP ${response.status()}: Failed to load page`);
    }

    // Enhanced dynamic content handling
    console.log('Handling dynamic content...');
    await handleDynamicContent(page);

    // Extract content using advanced multi-strategy approach
    console.log('Extracting content using multi-strategy approach...');

    // Everything inside this callback runs in the page context; its
    // console.log calls print to the page's console.
    const extractedData = await page.evaluate(() => {
      interface ContentCandidate {
        element: Element;
        score: number;
        content: string;
        selector: string;
      }

      // Pull an article body out of a parsed JSON-LD value, looking through
      // top-level arrays and "@graph" containers.
      function findStructuredBody(node: unknown): string {
        if (Array.isArray(node)) {
          for (const item of node) {
            const found = findStructuredBody(item);
            if (found) return found;
          }
          return "";
        }
        if (typeof node !== "object" || node === null) return "";
        const record = node as Record<string, unknown>;
        const articleBody = record["articleBody"];
        if (typeof articleBody === "string" && articleBody.length > 0) {
          return articleBody;
        }
        const text = record["text"];
        if (typeof text === "string" && text.length > 0) {
          return text;
        }
        return findStructuredBody(record["@graph"]);
      }

      // Capture structured data BEFORE the cleanup below, which deletes every
      // <script> element (including the JSON-LD ones).
      let structuredBody = "";
      const jsonLdScripts = Array.from(
        document.querySelectorAll('script[type="application/ld+json"]'),
      );
      for (const script of jsonLdScripts) {
        try {
          structuredBody = findStructuredBody(
            JSON.parse(script.textContent || "null"),
          );
        } catch {
          // Ignore JSON parsing errors
        }
        if (structuredBody) break;
      }

      // Remove unwanted elements first
      const unwantedSelectors = [
        "script", "style", "noscript", "iframe", "embed", "object",
        "nav", "header", "footer", "aside", "form",
        ".advertisement", ".ads", ".ad", ".adsbygoogle", "[class*='ad-']", "[id*='ad-']",
        ".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb",
        ".social-share", ".share", ".social", ".sns",
        ".comments", ".comment", ".disqus",
        ".cookie-banner", ".cookie", ".gdpr",
        ".popup", ".modal", ".overlay", ".lightbox",
        ".related", ".recommended", ".more-stories",
        ".tags", ".categories", ".metadata",
        ".author-bio", ".author-info",
        ".newsletter", ".subscribe", ".signup",
        "[role='complementary']", "[role='banner']", "[role='contentinfo']",
        "[aria-label*='advertisement']", "[aria-label*='sidebar']"
      ];

      unwantedSelectors.forEach((selector) => {
        try {
          document.querySelectorAll(selector).forEach((el) => el.remove());
        } catch {
          // Ignore invalid selectors
        }
      });

      // Extract title: first non-empty source wins
      let title = "";
      const titleSources = [
        () => document.querySelector('meta[property="og:title"]')?.getAttribute('content'),
        () => document.querySelector('meta[name="twitter:title"]')?.getAttribute('content'),
        () => document.querySelector('h1')?.textContent?.trim(),
        () => document.querySelector('.article-title, .post-title, .entry-title')?.textContent?.trim(),
        () => document.querySelector('title')?.textContent?.trim(),
        () => document.querySelector('[itemprop="headline"]')?.textContent?.trim()
      ];

      for (const source of titleSources) {
        try {
          const result = source();
          if (result && result.length > 0) {
            title = result;
            break;
          }
        } catch {
          continue;
        }
      }

      // Extract description: first non-empty source wins
      let description = "";
      const descriptionSources = [
        () => document.querySelector('meta[property="og:description"]')?.getAttribute('content'),
        () => document.querySelector('meta[name="description"]')?.getAttribute('content'),
        () => document.querySelector('meta[name="twitter:description"]')?.getAttribute('content'),
        () => document.querySelector('[itemprop="description"]')?.textContent?.trim()
      ];

      for (const source of descriptionSources) {
        try {
          const result = source();
          if (result && result.length > 0) {
            description = result;
            break;
          }
        } catch {
          continue;
        }
      }

      // Comprehensive content selectors with priorities
      const contentSelectors = [
        // Schema.org and structured data
        '[itemtype*="Article"] [itemprop="articleBody"]',
        '[itemtype*="NewsArticle"] [itemprop="articleBody"]',
        '[itemtype*="BlogPosting"] [itemprop="articleBody"]',

        // High-priority semantic selectors
        'article[role="main"]',
        'main article',
        '[role="main"] article',
        'article',

        // Common CMS and platform selectors
        '.post-content', '.entry-content', '.article-content', '.content-area',
        '.article-body', '.post-body', '.entry-body', '.story-body',
        '.main-content', '.primary-content', '.page-content',
        '.news-content', '.blog-content', '.editorial-content',

        // WordPress specific
        '.wp-content', '.entry', '.post',

        // Medium, Substack, Ghost
        '.section-content', '.postArticle-content', '.post-full-content',
        '.markup', '.section--body', '.section-divider + .section-content',

        // Japanese sites specific
        '.honbun', '.main_text', '.article_body', '.news_body',
        '.entry_text', '.blog_text', '.content_text',
        '.kiji', '.news', '.article',

        // Generic semantic HTML5
        'main', '[role="main"]',

        // ID-based selectors
        '#content', '#main', '#article', '#post', '#entry',
        '#main-content', '#primary', '#content-area',

        // Class-based common patterns
        '.content', '.main', '.wrapper', '.container',

        // Fallbacks
        'body'
      ];

      // Function to calculate content quality score (never negative)
      function calculateContentScore(element: Element): number {
        if (!element) return 0;

        const text = element.textContent || '';
        if (text.length < 100) return 0;

        let score = 0;

        // Base score from text length (diminishing returns)
        score += Math.min(text.length / 100, 50);

        // Paragraph density
        const paragraphs = element.querySelectorAll('p');
        const avgParagraphLength = paragraphs.length > 0
          ? Array.from(paragraphs).reduce((sum, p) => sum + (p.textContent?.length || 0), 0) / paragraphs.length
          : 0;

        if (avgParagraphLength > 100) score += 20;
        if (paragraphs.length > 3) score += 10;

        // Link density penalty (articles shouldn't be mostly links)
        const links = element.querySelectorAll('a');
        const linkText = Array.from(links).reduce((sum, link) => sum + (link.textContent?.length || 0), 0);
        const linkDensity = text.length > 0 ? linkText / text.length : 0;

        if (linkDensity < 0.2) score += 15;
        else if (linkDensity < 0.4) score += 5;
        else score -= 10;

        // Bonus for article-like structure
        if (element.tagName === 'ARTICLE') score += 25;
        if (element.getAttribute('role') === 'main') score += 20;
        if (element.querySelector('h1, h2, h3')) score += 10;

        // Bonus for semantic elements
        const semanticElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol');
        if (semanticElements.length > 5) score += 15;

        // Penalty for too many images without text
        const images = element.querySelectorAll('img');
        if (images.length > text.length / 500) score -= 5;

        // Penalty for navigation-like content
        const navWords = ['メニュー', 'ナビ', 'カテゴリ', 'タグ', 'menu', 'navigation', 'nav', 'sidebar'];
        // Read the attribute rather than `className`, which is not a string
        // on SVG elements.
        const className = (element.getAttribute('class') ?? '').toLowerCase();
        const id = element.id.toLowerCase();

        if (navWords.some(word => className.includes(word) || id.includes(word))) {
          score -= 20;
        }

        return Math.max(score, 0);
      }

      // Function to clean and normalize text into a single trimmed line.
      // Zero-width characters are stripped first so they cannot leave a
      // doubled space behind after whitespace is collapsed.
      function cleanText(text: string): string {
        return text
          .replace(/[\u200B-\u200D\uFEFF]/g, '') // Remove zero-width characters
          .replace(/\s+/g, ' ') // Normalize whitespace (including line breaks)
          .trim();
      }

      // Collect and score all content candidates. An element matched by
      // several selectors is recorded once, under the first selector.
      const candidates: ContentCandidate[] = [];
      const seenElements = new Set<Element>();

      for (const selector of contentSelectors) {
        try {
          const elements = document.querySelectorAll(selector);
          elements.forEach((element, index) => {
            if (seenElements.has(element)) return;
            const text = element.textContent || '';
            if (text.length > 200) { // Minimum content threshold
              seenElements.add(element);
              candidates.push({
                element,
                score: calculateContentScore(element),
                content: cleanText(text),
                selector: `${selector}[${index}]`
              });
            }
          });
        } catch {
          // Skip invalid selectors
          continue;
        }
      }

      // Sort candidates by score (highest first)
      candidates.sort((a, b) => b.score - a.score);

      console.log(`Found ${candidates.length} content candidates`);
      const best = candidates[0];
      if (best) {
        console.log(`Best candidate score: ${best.score}, selector: ${best.selector}`);
      }

      // Get the best content
      let content = "";

      if (best) {
        content = best.content;

        // If the best candidate is still short, try combining top candidates
        if (content.length < 500 && candidates.length > 1) {
          const topCandidates: ContentCandidate[] = [];
          for (const candidate of candidates.slice(0, 3)) {
            if (candidate.score <= 10) continue;
            // Skip ancestors/descendants of an already chosen element so the
            // same text is not concatenated twice.
            const overlaps = topCandidates.some(
              (chosen) =>
                chosen.element.contains(candidate.element) ||
                candidate.element.contains(chosen.element),
            );
            if (!overlaps) topCandidates.push(candidate);
          }
          const combinedContent = topCandidates.map(c => c.content).join('\n\n');
          if (combinedContent.length > content.length) {
            content = cleanText(combinedContent);
          }
        }
      }

      // Fallback strategies if still no good content
      if (!content || content.length < 200) {
        // Try paragraph aggregation
        const paragraphs = Array.from(document.querySelectorAll('p'))
          .map(p => p.textContent?.trim() || '')
          .filter(p => p.length > 50)
          .filter(p => {
            // Filter out likely navigation/boilerplate paragraphs
            const lowerP = p.toLowerCase();
            return !lowerP.includes('cookie') &&
                   !lowerP.includes('privacy') &&
                   !lowerP.includes('terms of service') &&
                   !lowerP.includes('subscribe') &&
                   !lowerP.includes('newsletter');
          });

        if (paragraphs.length > 0) {
          content = cleanText(paragraphs.join('\n\n'));
        }
      }

      // Final fallback: structured data captured before the cleanup
      if ((!content || content.length < 200) && structuredBody) {
        content = cleanText(structuredBody);
      }

      console.log(`Final content length: ${content.length} characters`);

      return { title, content, description };
    });

    // Validate extracted content with more lenient threshold
    if (!extractedData.content || extractedData.content.length < 100) {
      // Try one more extraction attempt with relaxed criteria
      const fallbackData = await page.evaluate(() => {
        // Last resort: extract all text from body, excluding common noise
        const body = document.body;
        if (body) {
          // Clone body to avoid modifying original
          const bodyClone = body.cloneNode(true) as Element;

          // Remove noise elements from clone
          const noiseSelectors = [
            'script', 'style', 'nav', 'header', 'footer', 'aside',
            '.ad', '.ads', '.advertisement', '[class*="ad-"]',
            '.menu', '.navigation', '.sidebar', '.social',
            '.cookie', '.popup', '.modal'
          ];

          noiseSelectors.forEach(selector => {
            bodyClone.querySelectorAll(selector).forEach(el => el.remove());
          });

          const text = bodyClone.textContent || '';
          return text.replace(/\s+/g, ' ').trim();
        }
        return '';
      });

      if (fallbackData && fallbackData.length > 200) {
        extractedData.content = fallbackData;
      } else {
        return {
          title: extractedData.title,
          content: extractedData.content || "",
          description: extractedData.description,
          success: false,
          error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
        };
      }
    }

    // Limit content length to avoid token limits
    const maxLength = 50000;
    let content = extractedData.content;
    if (content.length > maxLength) {
      content = content.substring(0, maxLength) + "...";
    }

    console.log(`Successfully extracted content: ${content.length} characters`);

    return {
      title: extractedData.title,
      content,
      description: extractedData.description,
      success: true,
    };
  } catch (error) {
    console.error(`Content extraction failed for ${url}:`, error);
    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  } finally {
    if (page) {
      // A failure to close must not replace the result being returned above.
      try {
        await page.close();
      } catch (closeError) {
        console.error(`Failed to close page for ${url}:`, closeError);
      }
    }
  }
}

/**
 * Returns the caller's content when it is already substantial; otherwise tries
 * to fetch better content from `originalLink`.
 *
 * @param _originalTitle - Unused; kept for call-site compatibility.
 * @param originalLink - URL to extract from when the existing text is short.
 * @param originalContent - Content already known for the article, if any.
 * @param originalDescription - Description already known for the article, if any.
 * @returns The extracted content/description on success, otherwise the
 *   original values unchanged.
 */
export async function enhanceArticleContent(
  _originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  // If we already have substantial content, use it.
  // `||` is deliberate: an empty string should fall through to the next source.
  const existingContent = originalContent || originalDescription || "";
  if (existingContent.length > 500) {
    return {
      content: originalContent,
      description: originalDescription,
    };
  }

  // Try to extract content from the URL
  const extracted = await extractArticleContent(originalLink);

  if (extracted.success && extracted.content) {
    return {
      content: extracted.content,
      description: extracted.description || originalDescription,
    };
  }

  // Return original content if extraction failed
  return {
    content: originalContent,
    description: originalDescription,
  };
}