import * as cheerio from 'cheerio';

/** Outcome of trying to pull readable article text out of a web page. */
export interface ExtractedContent {
  /** Page title taken from `<title>`, the first `<h1>`, or `og:title`. */
  title?: string;
  /** Whitespace-normalized article text; empty string when extraction failed. */
  content: string;
  /** Value of the `description` / `og:description` meta tag, if any. */
  description?: string;
  /** `true` when enough text was extracted to be useful. */
  success: boolean;
  /** Human-readable failure reason; only set when `success` is `false`. */
  error?: string;
}

/** Cheerio document handle, derived from `load` so it does not depend on the cheerio version's type names. */
type CheerioDocument = ReturnType<typeof cheerio.load>;

/** Abort the page download after this many milliseconds. */
const FETCH_TIMEOUT_MS = 30000;

/** Extracted text longer than this is cut and suffixed with `...` (keeps downstream token usage bounded). */
const MAX_CONTENT_LENGTH = 5000;

/** A selector match is only accepted when it yields more than this many characters. */
const MIN_SELECTOR_TEXT_LENGTH = 200;

/** Paragraphs at or below this length are ignored by the paragraph fallback. */
const MIN_PARAGRAPH_LENGTH = 50;

/** Below this length the paragraph fallback is discarded in favour of the whole body text. */
const MIN_PARAGRAPH_TOTAL_LENGTH = 100;

/** Anything shorter than this is reported as a failed extraction. */
const MIN_CONTENT_LENGTH = 50;

/** Number of leading bytes inspected when looking for a `<meta>` charset declaration. */
const CHARSET_SNIFF_BYTES = 2048;

/** Captures the charset label from a `Content-Type` header value. */
const HEADER_CHARSET_PATTERN = /charset\s*=\s*["']?([^\s;"']+)/i;

/** Captures the charset label from `<meta charset=...>` or `<meta http-equiv ... content="...; charset=...">`. */
const META_CHARSET_PATTERN = /<meta[^>]*charset\s*=\s*["']?([^\s"'\/>;]+)/i;

/** Browser-like request headers sent with the page download. */
const REQUEST_HEADERS: Record<string, string> = {
  'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3',
  'Accept-Encoding': 'gzip, deflate',
  'Connection': 'keep-alive',
  'Upgrade-Insecure-Requests': '1',
};

/** Elements stripped from the document before any text is read. */
const UNWANTED_SELECTOR =
  'script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments';

/** Candidate content containers, tried in order from most to least specific. */
const CONTENT_SELECTORS: readonly string[] = [
  // Common article selectors
  'article',
  '[role="main"]',
  '.article-content',
  '.post-content',
  '.entry-content',
  '.content',
  '.main-content',
  '.article-body',
  '.post-body',
  '.story-body',
  '.news-content',
  // Japanese news site specific selectors
  '.article',
  '.news-article',
  '.post',
  '.entry',
  '#content',
  '#main',
  '.main',
  // Fallback to common containers
  '.container',
  '#container',
  'main',
  'body',
];

/** Collapses every run of whitespace (including newlines) to one space and trims the ends. */
function normalizeWhitespace(text: string): string {
  return text.replace(/\s+/g, ' ').trim();
}

/**
 * Decodes a downloaded page using the charset declared in the `Content-Type`
 * header or, failing that, in a `<meta>` tag near the top of the document.
 * Falls back to UTF-8 when nothing is declared or the label is not supported.
 */
function decodeHtml(bytes: Uint8Array, contentType: string | null): string {
  const headerCharset = HEADER_CHARSET_PATTERN.exec(contentType ?? '')?.[1];
  // The declaration itself is ASCII, so it survives a UTF-8 decode of the
  // leading bytes even when the page body is in another ASCII-compatible encoding.
  const declaredCharset =
    headerCharset ??
    META_CHARSET_PATTERN.exec(new TextDecoder().decode(bytes.subarray(0, CHARSET_SNIFF_BYTES)))?.[1];

  if (declaredCharset !== undefined) {
    try {
      return new TextDecoder(declaredCharset).decode(bytes);
    } catch {
      // Unknown or unsupported label: fall through to UTF-8.
    }
  }
  return new TextDecoder().decode(bytes);
}

/**
 * Cuts `text` to at most `maxLength` UTF-16 code units and appends `...`.
 * Backs off by one unit when the cut would split a surrogate pair.
 */
function truncateContent(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  let end = maxLength;
  const lastUnit = text.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return text.substring(0, end) + '...';
}

/**
 * Picks the best available body text from an already-cleaned document:
 * first selector with substantial text, then long paragraphs, then the whole body.
 */
function extractMainText($: CheerioDocument): string {
  for (const selector of CONTENT_SELECTORS) {
    const matches = $(selector);
    if (matches.length === 0) {
      continue;
    }
    const text = normalizeWhitespace(matches.text());
    if (text.length > MIN_SELECTOR_TEXT_LENGTH) {
      return text;
    }
  }

  // No container had enough text: stitch together the longer paragraphs.
  const paragraphText = $('p')
    .map((_, el) => $(el).text().trim())
    .get()
    .filter((paragraph: string) => paragraph.length > MIN_PARAGRAPH_LENGTH)
    .join('\n\n');
  if (paragraphText.length >= MIN_PARAGRAPH_TOTAL_LENGTH) {
    return paragraphText;
  }

  // Final fallback: use body text.
  return normalizeWhitespace($('body').text());
}

/**
 * Downloads `url` and extracts the article title, description and main text.
 *
 * Never rejects: network errors, timeouts, non-2xx responses, unsupported URL
 * schemes and pages with too little text are all reported through
 * `success: false` and `error`.
 *
 * @param url - Absolute http(s) URL of the article page.
 * @returns The extracted fields; `content` is capped at 5000 characters plus `...`.
 */
export async function extractArticleContent(url: string): Promise<ExtractedContent> {
  try {
    // The URL typically comes from feed data, so only plain web schemes are fetched.
    const { protocol } = new URL(url);
    if (protocol !== 'http:' && protocol !== 'https:') {
      throw new Error(`Unsupported URL protocol: ${protocol}`);
    }

    // Fetch the HTML content
    const response = await fetch(url, {
      headers: REQUEST_HEADERS,
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = decodeHtml(
      new Uint8Array(await response.arrayBuffer()),
      response.headers.get('content-type')
    );
    const $ = cheerio.load(html);

    // Remove unwanted elements
    $(UNWANTED_SELECTOR).remove();

    // `||` (not `??`) is deliberate: an empty string must fall through to the next source.
    const title =
      $('title').text().trim() ||
      $('h1').first().text().trim() ||
      $('meta[property="og:title"]').attr('content')?.trim() ||
      '';
    const description =
      $('meta[name="description"]').attr('content')?.trim() ||
      $('meta[property="og:description"]').attr('content')?.trim() ||
      '';

    const content = extractMainText($);

    // Validate extracted content
    if (content.length < MIN_CONTENT_LENGTH) {
      return {
        title,
        content: '',
        description,
        success: false,
        error: 'Insufficient content extracted',
      };
    }

    return {
      title,
      // Limit content length to avoid token limits
      content: truncateContent(content, MAX_CONTENT_LENGTH),
      description,
      success: true,
    };
  } catch (error) {
    return {
      title: '',
      content: '',
      description: '',
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error occurred',
    };
  }
}

/**
 * Returns the best available content/description for an article, scraping the
 * article page only when the supplied text is short.
 *
 * The supplied text counts as sufficient when `originalContent` (or, if that is
 * empty, `originalDescription`) is longer than 500 characters; in that case the
 * originals are returned untouched and no request is made. If scraping fails,
 * the originals are returned as well.
 *
 * @param originalTitle - Article title; currently not used by this function.
 * @param originalLink - URL of the article page to scrape when needed.
 * @param originalContent - Content already available for the article, if any.
 * @param originalDescription - Description already available for the article, if any.
 * @returns Scraped content with scraped-or-original description, or the originals.
 */
export async function enhanceArticleContent(
  originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string
): Promise<{ content?: string; description?: string }> {
  const originals = { content: originalContent, description: originalDescription };

  // If we already have substantial content, use it
  const existingContent = originalContent || originalDescription || '';
  if (existingContent.length > 500) {
    return originals;
  }

  // Try to extract content from the URL
  const extracted = await extractArticleContent(originalLink);
  if (extracted.success && extracted.content) {
    return {
      content: extracted.content,
      description: extracted.description || originalDescription,
    };
  }

  // Return original content if extraction failed
  return originals;
}