import * as cheerio from "cheerio";

/** Outcome of one article-extraction attempt. */
export interface ExtractedContent {
  /** Page title, when one could be found. */
  title?: string;
  /** Extracted article text; empty string when extraction failed. */
  content: string;
  /** Meta description of the page, when present. */
  description?: string;
  /** True when enough text was extracted to be usable. */
  success: boolean;
  /** Human-readable failure reason; only set when `success` is false. */
  error?: string;
}

/** Abort the HTTP request after this many milliseconds. */
const FETCH_TIMEOUT_MS = 30000;

/** A selector match is accepted only if its text is longer than this. */
const MIN_SELECTOR_TEXT_LENGTH = 200;

/** Paragraphs at or below this length are ignored by the paragraph fallback. */
const MIN_PARAGRAPH_LENGTH = 50;

/** Below this length the paragraph result is discarded in favour of the body text. */
const MIN_FALLBACK_LENGTH = 100;

/** Below this length the extraction is reported as failed. */
const MIN_CONTENT_LENGTH = 50;

/** Maximum number of UTF-16 code units kept before the "..." suffix. */
const MAX_CONTENT_LENGTH = 5000;

/** Browser-like request headers sent with the page fetch. */
const REQUEST_HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  Accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
  "Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
  "Accept-Encoding": "gzip, deflate",
  Connection: "keep-alive",
  "Upgrade-Insecure-Requests": "1",
};

/** Elements stripped from the document before any text is read. */
const UNWANTED_SELECTOR =
  "script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments";

/** Candidate content containers, tried in order until one yields enough text. */
const CONTENT_SELECTORS = [
  // Common article selectors
  "article",
  '[role="main"]',
  ".article-content",
  ".post-content",
  ".entry-content",
  ".content",
  ".main-content",
  ".article-body",
  ".post-body",
  ".story-body",
  ".news-content",
  // Japanese news site specific selectors
  ".article",
  ".news-article",
  ".post",
  ".entry",
  "#content",
  "#main",
  ".main",
  // Fallback to common containers
  ".container",
  "#container",
  "main",
  "body",
];

/** Collapses every run of whitespace (including newlines) to one space and trims. */
function normalizeWhitespace(text: string): string {
  return text.replace(/\s+/g, " ").trim();
}

/**
 * Cuts `text` to at most `maxLength` UTF-16 code units and appends "...".
 * Text that already fits is returned unchanged.
 */
function truncateContent(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }

  let end = maxLength;
  const lastKeptUnit = text.charCodeAt(end - 1);
  // Do not end on a high surrogate: that would split a surrogate pair and
  // leave an unpaired code unit in the output.
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) {
    end -= 1;
  }

  return text.substring(0, end) + "...";
}

/**
 * Downloads `url` and extracts the title, meta description and main text.
 *
 * Text extraction tries, in order: the first entry of `CONTENT_SELECTORS`
 * whose text exceeds `MIN_SELECTOR_TEXT_LENGTH`; the page's longer `<p>`
 * elements joined by blank lines; and finally the whole `<body>` text.
 * The result is capped at `MAX_CONTENT_LENGTH` code units plus "...".
 *
 * @param url - Address of the page to fetch.
 * @returns The extraction result. This function does not reject: network
 *   errors, non-2xx responses and timeouts are reported through
 *   `success: false` and `error`.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  try {
    // Fetch the HTML content
    const response = await fetch(url, {
      headers: REQUEST_HEADERS,
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Remove unwanted elements so they cannot contribute to the text below.
    $(UNWANTED_SELECTOR).remove();

    // `||` is intentional here: an empty string must fall through to the
    // next source.
    const title =
      $("title").text().trim() ||
      $("h1").first().text().trim() ||
      $('meta[property="og:title"]').attr("content") ||
      "";

    const description =
      $('meta[name="description"]').attr("content") ||
      $('meta[property="og:description"]').attr("content") ||
      "";

    let content = "";

    // Strategy 1: first known container with substantial text.
    for (const selector of CONTENT_SELECTORS) {
      const element = $(selector);
      if (element.length === 0) {
        continue;
      }

      const extractedText = normalizeWhitespace(element.text());
      if (extractedText.length > MIN_SELECTOR_TEXT_LENGTH) {
        content = extractedText;
        break;
      }
    }

    // Strategy 2: join the longer paragraphs.
    if (!content) {
      const paragraphs = $("p")
        .map((_, el) => $(el).text().trim())
        .get();
      content = paragraphs
        .filter((p) => p.length > MIN_PARAGRAPH_LENGTH)
        .join("\n\n");
    }

    // Strategy 3: whole body text.
    if (!content || content.length < MIN_FALLBACK_LENGTH) {
      content = normalizeWhitespace($("body").text());
    }

    // Validate extracted content
    if (!content || content.length < MIN_CONTENT_LENGTH) {
      return {
        title,
        content: "",
        description,
        success: false,
        error: "Insufficient content extracted",
      };
    }

    // Limit content length to avoid token limits
    return {
      title,
      content: truncateContent(content, MAX_CONTENT_LENGTH),
      description,
      success: true,
    };
  } catch (error) {
    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  }
}

/**
 * Returns the best available content and description for an article.
 *
 * If the supplied content (or, when that is empty, the description) is longer
 * than 500 characters, the originals are returned as-is and no request is
 * made. Otherwise the page at `originalLink` is fetched via
 * `extractArticleContent`; on success its content is used and its description
 * takes precedence over `originalDescription`. If extraction fails, the
 * originals are returned unchanged.
 *
 * @param originalTitle - Not read by this function; kept in the signature for
 *   existing callers.
 * @param originalLink - URL fetched when the existing content is too short.
 * @param originalContent - Content already known for the article, if any.
 * @param originalDescription - Description already known for the article, if any.
 * @returns The chosen content and description; either may be undefined.
 */
export async function enhanceArticleContent(
  originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  // If we already have substantial content, use it
  const existingContent = originalContent || originalDescription || "";
  if (existingContent.length > 500) {
    return {
      content: originalContent,
      description: originalDescription,
    };
  }

  // Try to extract content from the URL
  const extracted = await extractArticleContent(originalLink);
  if (extracted.success && extracted.content) {
    return {
      content: extracted.content,
      description: extracted.description || originalDescription,
    };
  }

  // Return original content if extraction failed
  return {
    content: originalContent,
    description: originalDescription,
  };
}