/** Browser-like User-Agent sent by the fetch fallback. */
const FETCH_FALLBACK_USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

/** Abort the fallback request after this many milliseconds. */
const FETCH_FALLBACK_TIMEOUT_MS = 30000;

/** Upper bound on returned content length (keeps downstream token usage bounded). */
const FETCH_FALLBACK_MAX_CONTENT_LENGTH = 50000;

/** Elements stripped from the document before any text is scored. */
const FETCH_FALLBACK_UNWANTED_SELECTORS: readonly string[] = [
  "script", "style", "noscript", "iframe", "embed", "object",
  "nav", "header", "footer", "aside", "form",
  ".advertisement", ".ads", ".ad", ".adsbygoogle",
  // Anchored "ad-" matches: a bare [class*='ad-'] also hits "lead-", "head-",
  // "thread-", "download-", ... and can delete the article itself.
  "[class^='ad-']", "[class*=' ad-']", "[class*='-ad-']",
  "[id^='ad-']", "[id*='-ad-']",
  ".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb",
  ".social-share", ".share", ".social", ".sns",
  ".comments", ".comment", ".disqus",
  ".cookie-banner", ".cookie", ".gdpr",
  ".popup", ".modal", ".overlay", ".lightbox",
  ".related", ".recommended", ".more-stories",
  ".tags", ".categories", ".metadata",
  ".author-bio", ".author-info",
  ".newsletter", ".subscribe", ".signup",
  "[role='complementary']", "[role='banner']", "[role='contentinfo']",
  "[aria-label*='advertisement']", "[aria-label*='sidebar']",
];

/** Candidate containers for article text, roughly from most to least specific. */
const FETCH_FALLBACK_CONTENT_SELECTORS: readonly string[] = [
  '[itemtype*="Article"] [itemprop="articleBody"]',
  '[itemtype*="NewsArticle"] [itemprop="articleBody"]',
  '[itemtype*="BlogPosting"] [itemprop="articleBody"]',
  'article[role="main"]',
  "main article",
  '[role="main"] article',
  "article",
  ".post-content", ".entry-content", ".article-content", ".content-area",
  ".article-body", ".post-body", ".entry-body", ".story-body",
  ".main-content", ".primary-content", ".page-content",
  ".news-content", ".blog-content", ".editorial-content",
  ".wp-content", ".entry", ".post",
  ".section-content", ".postArticle-content", ".post-full-content",
  ".markup", ".section--body", ".section-divider + .section-content",
  ".honbun", ".main_text", ".article_body", ".news_body",
  ".entry_text", ".blog_text", ".content_text",
  ".kiji", ".news", ".article",
  "main", '[role="main"]',
  "#content", "#main", "#article", "#post", "#entry",
  "#main-content", "#primary", "#content-area",
  ".content", ".main", ".wrapper", ".container",
];

/** Class/id fragments that mark a container as navigation rather than content. */
const FETCH_FALLBACK_NAV_WORDS: readonly string[] = [
  "メニュー", "ナビ", "カテゴリ", "タグ", "menu", "navigation", "nav", "sidebar",
];

/** Paragraphs mentioning any of these are treated as site boilerplate. */
const FETCH_FALLBACK_BOILERPLATE_WORDS: readonly string[] = [
  "cookie", "privacy", "terms of service", "subscribe", "newsletter",
];

/** A cheerio selection as returned by calling the loaded `$` function. */
type FetchFallbackSelection = ReturnType<CheerioAPI>;

/** A scored block of candidate article text. */
interface FetchFallbackCandidate {
  score: number;
  content: string;
  selector: string;
}

/**
 * Strips zero-width characters, collapses every whitespace run to one space
 * and trims. The result never contains a newline, so callers that want
 * paragraph breaks must normalize each piece first and join afterwards.
 */
function normalizeFallbackText(text: string): string {
  return text
    .replace(/[\u200B-\u200D\uFEFF]/g, "")
    .replace(/\s+/g, " ")
    .trim();
}

/** Returns the first value that is non-empty after trimming, or `""`. */
function firstNonEmptyFallbackValue(values: ReadonlyArray<string | undefined>): string {
  for (const value of values) {
    const trimmed = value?.trim();
    if (trimmed) return trimmed;
  }
  return "";
}

/**
 * Heuristic quality score for a candidate container: rewards long text,
 * substantial paragraphs, low link density and article-like markup;
 * penalizes navigation-looking class/id names. Never negative.
 */
function scoreFetchFallbackCandidate($: CheerioAPI, element: FetchFallbackSelection): number {
  const text = element.text();
  if (text.length < 100) return 0;

  // Base score from text length, capped so huge wrappers do not dominate.
  let score = Math.min(text.length / 100, 50);

  // Paragraph density.
  const paragraphs = element.find("p").toArray();
  const paragraphChars = paragraphs.reduce((sum, p) => sum + $(p).text().length, 0);
  const avgParagraphLength = paragraphs.length > 0 ? paragraphChars / paragraphs.length : 0;
  if (avgParagraphLength > 100) score += 20;
  if (paragraphs.length > 3) score += 10;

  // Link density: menus and link lists are mostly anchor text.
  const linkChars = element
    .find("a")
    .toArray()
    .reduce((sum, link) => sum + $(link).text().length, 0);
  const linkDensity = linkChars / text.length; // text.length >= 100 here
  if (linkDensity < 0.2) score += 15;
  else if (linkDensity < 0.4) score += 5;
  else score -= 10;

  // Article-like structure.
  if (element.prop("tagName") === "ARTICLE") score += 25;
  if (element.attr("role") === "main") score += 20;
  if (element.find("h1, h2, h3").length > 0) score += 10;
  if (element.find("p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol").length > 5) score += 15;

  // Navigation-like containers.
  const className = (element.attr("class") ?? "").toLowerCase();
  const id = (element.attr("id") ?? "").toLowerCase();
  if (FETCH_FALLBACK_NAV_WORDS.some((word) => className.includes(word) || id.includes(word))) {
    score -= 20;
  }

  return Math.max(score, 0);
}

/**
 * Scores every element matched by the content selectors and returns the
 * candidates sorted best-first. Each DOM node is scored once even when
 * several selectors match it, so duplicates cannot crowd the top of the list.
 */
function collectFetchFallbackCandidates($: CheerioAPI): FetchFallbackCandidate[] {
  const candidates: FetchFallbackCandidate[] = [];
  const seenNodes = new Set<unknown>();

  for (const selector of FETCH_FALLBACK_CONTENT_SELECTORS) {
    try {
      $(selector).each((index, node) => {
        if (seenNodes.has(node)) return;
        const selection = $(node);
        const text = selection.text();
        if (text.length <= 200) return;
        seenNodes.add(node);
        candidates.push({
          score: scoreFetchFallbackCandidate($, selection),
          content: normalizeFallbackText(text),
          selector: `${selector}[${index}]`,
        });
      });
    } catch {
      // Best effort: a selector the engine rejects is skipped, not fatal.
    }
  }

  // Array.prototype.sort is stable, so ties keep selector-priority order.
  return candidates.sort((a, b) => b.score - a.score);
}

/**
 * Drops pieces that are fully contained in another piece (nested containers
 * yield overlapping text), keeping the larger one.
 */
function mergeDistinctFallbackText(pieces: readonly string[]): string[] {
  let kept: string[] = [];
  for (const piece of pieces) {
    if (kept.some((existing) => existing.includes(piece))) continue;
    kept = kept.filter((existing) => !piece.includes(existing));
    kept.push(piece);
  }
  return kept;
}

/**
 * Depth-limited search of a parsed JSON-LD value for a string `articleBody`
 * (preferred) or `text`, descending into arrays and `@graph`.
 */
function findJsonLdText(node: unknown, depth = 0): string {
  if (depth > 5 || node === null || typeof node !== "object") return "";

  if (Array.isArray(node)) {
    for (const item of node) {
      const found = findJsonLdText(item, depth + 1);
      if (found) return found;
    }
    return "";
  }

  const record = node as Record<string, unknown>;
  for (const key of ["articleBody", "text"]) {
    const value = record[key];
    if (typeof value === "string" && value.trim().length > 0) return value;
  }
  return findJsonLdText(record["@graph"], depth + 1);
}

/** Returns the first article text found in any of the raw JSON-LD blocks, or `""`. */
function findJsonLdArticleText(rawBlocks: readonly string[]): string {
  for (const raw of rawBlocks) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch {
      continue; // Malformed JSON-LD is common in the wild; try the next block.
    }
    const found = findJsonLdText(parsed);
    if (found) return found;
  }
  return "";
}

/**
 * Fallback content extraction using fetch + cheerio (no browser).
 *
 * Downloads the page, strips boilerplate elements, then picks article text
 * via, in order: the best-scoring content container (optionally combined
 * with other top candidates when short), aggregation of all substantial
 * `<p>` elements, and finally JSON-LD structured data.
 *
 * @param url Page to download.
 * @returns An extraction result. This function does not throw: network,
 *   HTTP and parsing failures are reported as `success: false` with `error`
 *   set. Fewer than 100 extracted characters is also reported as a failure.
 */
async function extractWithFetchFallback(url: string): Promise<ExtractedContent> {
  console.log(`Using fetch fallback for: ${url}`);

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': FETCH_FALLBACK_USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'no-cache',
      },
      signal: AbortSignal.timeout(FETCH_FALLBACK_TIMEOUT_MS),
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const $ = cheerio.load(await response.text());

    // JSON-LD lives in <script> tags, so it must be captured BEFORE the
    // unwanted-element pass below removes every script from the document.
    const jsonLdBlocks = $('script[type="application/ld+json"]')
      .toArray()
      .map((node) => $(node).html() ?? "")
      .filter((raw) => raw.trim().length > 0);

    for (const selector of FETCH_FALLBACK_UNWANTED_SELECTORS) {
      $(selector).remove();
    }

    const title = firstNonEmptyFallbackValue([
      $('meta[property="og:title"]').attr('content'),
      $('meta[name="twitter:title"]').attr('content'),
      $('h1').first().text(),
      $('.article-title, .post-title, .entry-title').first().text(),
      $('title').text(),
      $('[itemprop="headline"]').first().text(),
    ]);

    const description = firstNonEmptyFallbackValue([
      $('meta[property="og:description"]').attr('content'),
      $('meta[name="description"]').attr('content'),
      $('meta[name="twitter:description"]').attr('content'),
      $('[itemprop="description"]').first().text(),
    ]);

    const candidates = collectFetchFallbackCandidates($);
    console.log(`Found ${candidates.length} content candidates`);

    let content = "";
    const best = candidates[0];
    if (best) {
      console.log(`Best candidate score: ${best.score}, selector: ${best.selector}`);
      content = best.content;

      // A short winner may be one fragment of a split article: try adding
      // the other top candidates, without repeating overlapping text.
      if (content.length < 500 && candidates.length > 1) {
        const combined = mergeDistinctFallbackText(
          candidates
            .slice(0, 3)
            .filter((candidate) => candidate.score > 10)
            .map((candidate) => candidate.content),
        ).join("\n\n");
        if (combined.length > content.length) {
          content = combined;
        }
      }
    }

    // Fallback 1: stitch together every substantial, non-boilerplate paragraph.
    if (content.length < 200) {
      console.log('Using paragraph aggregation fallback...');
      const paragraphs = $('p')
        .toArray()
        .map((p) => normalizeFallbackText($(p).text()))
        .filter((paragraph) => paragraph.length > 50)
        .filter((paragraph) => {
          const lowered = paragraph.toLowerCase();
          return !FETCH_FALLBACK_BOILERPLATE_WORDS.some((word) => lowered.includes(word));
        });

      const aggregated = paragraphs.join("\n\n");
      if (aggregated.length > content.length) {
        content = aggregated;
      }
    }

    // Fallback 2: article text embedded as JSON-LD structured data.
    if (content.length < 200) {
      console.log('Trying structured data fallback...');
      const structured = normalizeFallbackText(findJsonLdArticleText(jsonLdBlocks));
      if (structured.length > content.length) {
        content = structured;
      }
    }

    // Limit content length to avoid token limits.
    if (content.length > FETCH_FALLBACK_MAX_CONTENT_LENGTH) {
      content = content.substring(0, FETCH_FALLBACK_MAX_CONTENT_LENGTH) + "...";
    }

    console.log(`Fetch fallback extracted content: ${content.length} characters`);

    if (content.length < 100) {
      return {
        title,
        content: '',
        description,
        success: false,
        error: `Insufficient content extracted via fetch fallback (${content.length} characters)`,
      };
    }

    return {
      title,
      content,
      description,
      success: true,
    };
  } catch (error) {
    console.error(`Fetch fallback failed:`, error);
    return {
      title: '',
      content: '',
      description: '',
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error in fetch fallback',
    };
  }
}
threw error for ${url}:`, fallbackError); + } + } + // Provide more specific error messages let errorMessage = "Unknown error occurred"; if (error instanceof Error) { @@ -682,6 +1008,8 @@ export async function extractArticleContent( errorMessage = `Client error: ${error.message}`; } else if (error.message.includes('HTTP 5')) { errorMessage = `Server error: ${error.message}`; + } else if (error.message.includes('TimeoutError')) { + errorMessage = "Puppeteer browser launch timeout - both Puppeteer and fetch fallback failed"; } else { errorMessage = error.message; }