import puppeteer, { type Browser, type Page } from "puppeteer";

/** Result of attempting to pull readable article text out of a web page. */
export interface ExtractedContent {
  /** Page title, when one was found. */
  title?: string;
  /** Extracted article text; empty when extraction failed. */
  content: string;
  /** Meta description, when one was found. */
  description?: string;
  /** Whether enough content was extracted to be usable. */
  success: boolean;
  /** Failure reason; set only when `success` is false. */
  error?: string;
}

/** User agent presented to the sites being scraped. */
const USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";

/** Timeout applied to navigation and to other page operations. */
const PAGE_TIMEOUT_MS = 30000;

/** Extra pause after navigation to let late client-side rendering finish. */
const RENDER_SETTLE_MS = 2000;

/** Extractions shorter than this are reported as failures. */
const MIN_CONTENT_LENGTH = 50;

/** Extracted content is truncated to this many UTF-16 code units. */
const MAX_CONTENT_LENGTH = 5000;

/** Existing content longer than this is used as-is, without scraping. */
const SUBSTANTIAL_CONTENT_LENGTH = 500;

// Singleton browser instance for reuse
let sharedBrowser: Browser | null = null;

// In-flight launch, shared so concurrent callers do not each start a browser.
let pendingLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared browser, launching it if there is none or if the
 * previous one has disconnected.
 *
 * Callers that arrive while a launch is in progress all receive that same
 * launch instead of starting (and leaking) additional browser processes.
 */
async function getBrowser(): Promise<Browser> {
  if (sharedBrowser && sharedBrowser.isConnected()) {
    return sharedBrowser;
  }

  if (!pendingLaunch) {
    pendingLaunch = puppeteer
      .launch({
        headless: true,
        args: [
          "--no-sandbox",
          "--disable-setuid-sandbox",
          "--disable-dev-shm-usage",
          "--disable-accelerated-2d-canvas",
          "--no-first-run",
          "--no-zygote",
          "--disable-gpu",
          // NOTE(review): this turns off the same-origin policy while loading
          // arbitrary third-party pages, and "--no-sandbox" is also set.
          // Confirm it is really needed for text extraction.
          "--disable-web-security",
          "--disable-features=VizDisplayCompositor",
        ],
      })
      .then((browser) => {
        sharedBrowser = browser;
        return browser;
      })
      .finally(() => {
        // Cleared on success and failure alike so a failed launch is retried.
        pendingLaunch = null;
      });
  }

  return pendingLaunch;
}

/**
 * Closes the shared browser, if any, and forgets it so the next extraction
 * launches a fresh one.
 *
 * @throws Whatever `browser.close()` rejects with; the shared reference is
 *   cleared even in that case.
 */
export async function closeBrowser(): Promise<void> {
  if (pendingLaunch) {
    try {
      // Wait so a browser that is still starting gets closed as well.
      await pendingLaunch;
    } catch {
      // The launch failed, so there is nothing to close.
    }
  }

  const browser = sharedBrowser;
  // Drop the reference first: a disconnected browser, or one whose close()
  // throws, must not be kept around as the singleton.
  sharedBrowser = null;

  if (browser && browser.isConnected()) {
    await browser.close();
  }
}

/**
 * Scrapes title, description and main text from the current document.
 *
 * This function is handed to `page.evaluate`, which serializes it and runs it
 * inside the page. It must therefore be fully self-contained: it cannot
 * reference any constant, helper or import from this module.
 *
 * It mutates the DOM by removing boilerplate elements before reading text.
 */
function extractFromDocument(): {
  title: string;
  content: string;
  description: string;
} {
  // Remove unwanted elements
  const unwantedSelectors = [
    "script",
    "style",
    "nav",
    "header",
    "footer",
    "aside",
    ".advertisement",
    ".ads",
    ".sidebar",
    ".menu",
    ".navigation",
    ".social-share",
    ".comments",
    ".cookie-banner",
    ".popup",
    ".modal",
  ];
  for (const selector of unwantedSelectors) {
    document.querySelectorAll(selector).forEach((el) => el.remove());
  }

  // `||` rather than `??` on purpose: an empty string must fall through to
  // the next candidate.
  const title =
    document.querySelector("title")?.textContent?.trim() ||
    document.querySelector("h1")?.textContent?.trim() ||
    document
      .querySelector('meta[property="og:title"]')
      ?.getAttribute("content") ||
    "";

  const description =
    document.querySelector('meta[name="description"]')?.getAttribute("content") ||
    document
      .querySelector('meta[property="og:description"]')
      ?.getAttribute("content") ||
    "";

  // Candidate containers, tried in order from most to least specific.
  const contentSelectors = [
    // Common article selectors
    "article",
    '[role="main"]',
    ".article-content",
    ".post-content",
    ".entry-content",
    ".content",
    ".main-content",
    ".article-body",
    ".post-body",
    ".story-body",
    ".news-content",
    // Japanese news site specific selectors
    ".article",
    ".news-article",
    ".post",
    ".entry",
    "#content",
    "#main",
    ".main",
    // Fallback to common containers
    ".container",
    "#container",
    "main",
    "body",
  ];

  let content = "";
  for (const selector of contentSelectors) {
    const element = document.querySelector(selector);
    if (!element) {
      continue;
    }
    // Collapse every whitespace run (newlines included) to a single space.
    const text = (element.textContent ?? "").replace(/\s+/g, " ").trim();
    // Only accept a container that holds substantial content.
    if (text.length > 200) {
      content = text;
      break;
    }
  }

  // If still no content, stitch together the longer paragraphs.
  if (!content) {
    content = Array.from(document.querySelectorAll("p"))
      .map((p) => p.textContent?.trim() || "")
      .filter((text) => text.length > 50)
      .join("\n\n");
  }

  // Final fallback: use body text
  if (content.length < 100) {
    content = (document.body?.textContent || "").replace(/\s+/g, " ").trim();
  }

  return { title, content, description };
}

/**
 * Truncates `text` to at most `maxLength` UTF-16 code units and appends
 * "..." when something was cut off.
 */
function truncateContent(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  let end = maxLength;
  const lastUnit = text.charCodeAt(end - 1);
  // Do not cut between the two halves of a surrogate pair.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return text.substring(0, end) + "...";
}

/**
 * Loads `url` in the shared headless browser and extracts the article text.
 *
 * Failures (navigation errors, non-OK responses, too little text) are reported
 * through the returned object (`success: false` plus `error`); the returned
 * promise does not reject.
 *
 * @param url Address of the page to scrape.
 * @returns The extracted title, description and content, with the content
 *   truncated to a fixed maximum length.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  let page: Page | null = null;

  try {
    const browser = await getBrowser();
    page = await browser.newPage();

    // Set user agent and viewport
    await page.setUserAgent(USER_AGENT);
    await page.setViewport({ width: 1280, height: 720 });

    // Set navigation timeout
    page.setDefaultNavigationTimeout(PAGE_TIMEOUT_MS);
    page.setDefaultTimeout(PAGE_TIMEOUT_MS);

    // NOTE(review): `url` is passed to the browser unfiltered. If it can come
    // from untrusted feeds, consider restricting it to http(s) schemes.
    const response = await page.goto(url, {
      waitUntil: "networkidle2",
      timeout: PAGE_TIMEOUT_MS,
    });

    if (!response) {
      throw new Error("No response received: Failed to load page");
    }
    if (!response.ok()) {
      throw new Error(`HTTP ${response.status()}: Failed to load page`);
    }

    // Wait for potential dynamic content
    await new Promise((resolve) => setTimeout(resolve, RENDER_SETTLE_MS));

    const extracted = await page.evaluate(extractFromDocument);

    // Validate extracted content
    if (extracted.content.length < MIN_CONTENT_LENGTH) {
      return {
        title: extracted.title,
        content: "",
        description: extracted.description,
        success: false,
        error: "Insufficient content extracted",
      };
    }

    return {
      title: extracted.title,
      // Limit content length to avoid token limits
      content: truncateContent(extracted.content, MAX_CONTENT_LENGTH),
      description: extracted.description,
      success: true,
    };
  } catch (error) {
    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  } finally {
    if (page) {
      try {
        await page.close();
      } catch {
        // The page or browser is already gone. A cleanup failure must not
        // replace the result computed above with a rejection.
      }
    }
  }
}

/**
 * Returns the best available content and description for an article.
 *
 * When the supplied content (or, failing that, the description) is already
 * substantial, it is returned untouched. Otherwise the article page is scraped
 * and the scraped text is used; if scraping fails, the originals are returned.
 *
 * @param _originalTitle Unused; kept for signature compatibility.
 * @param originalLink URL of the article, scraped when more content is needed.
 * @param originalContent Content already known for the article, if any.
 * @param originalDescription Description already known for the article, if any.
 */
export async function enhanceArticleContent(
  _originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  const original = {
    content: originalContent,
    description: originalDescription,
  };

  // If we already have substantial content, use it
  const existingContent = originalContent || originalDescription || "";
  if (existingContent.length > SUBSTANTIAL_CONTENT_LENGTH) {
    return original;
  }

  // Try to extract content from the URL
  const extracted = await extractArticleContent(originalLink);
  if (extracted.success && extracted.content) {
    return {
      content: extracted.content,
      description: extracted.description || originalDescription,
    };
  }

  // Return original content if extraction failed
  return original;
}