/**
 * Best-effort settling of a freshly navigated Puppeteer page before its
 * content is scraped.
 *
 * After `body` is present, four independent strategies run in parallel, each
 * capped by its own time budget:
 *   1. wait for a loading indicator (spinner, skeleton, ...) to show up and go away,
 *   2. scroll down in steps to trigger lazy loading, then scroll back to the top,
 *   3. wait for a typical article container to exist,
 *   4. click the first "read more" / expand control that is found.
 * A short settle delay follows. Any failure is logged and replaced by a fixed
 * fallback delay, so a page that misbehaves does not abort the extraction.
 *
 * Delays are implemented with `setTimeout` instead of `page.waitForTimeout`,
 * because that method is not available on newer Puppeteer releases; calling it
 * there would throw inside the `try` and then again inside the `catch`.
 *
 * @param page - The Puppeteer page to settle. Only `waitForSelector`,
 *   `evaluate` and `$` are used.
 * @param options - Optional delay overrides in milliseconds. Every field
 *   defaults to the value that used to be hard-coded.
 * @returns Resolves once the page has been given its chance to settle.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept as `any` so existing callers are unaffected
async function handleDynamicContent(
  page: any,
  options: {
    /** Budget for the loading-indicator strategy. */
    loadingBudgetMs?: number;
    /** Budget for the auto-scroll strategy. */
    scrollBudgetMs?: number;
    /** Budget for the content-container strategy. */
    contentBudgetMs?: number;
    /** Budget for the expand-button strategy. */
    expandBudgetMs?: number;
    /** Pause after clicking an expand control. */
    postClickDelayMs?: number;
    /** Final pause once all strategies have settled. */
    settleDelayMs?: number;
    /** Pause used instead of the strategies when something fails. */
    fallbackDelayMs?: number;
  } = {},
): Promise<void> {
  const {
    loadingBudgetMs = 3000,
    scrollBudgetMs = 8000,
    contentBudgetMs = 5000,
    expandBudgetMs = 3000,
    postClickDelayMs = 2000,
    settleDelayMs = 2000,
    fallbackDelayMs = 3000,
  } = options;

  /** Resolves after `ms` milliseconds; independent of the Puppeteer version. */
  const sleep = (ms: number): Promise<void> =>
    new Promise<void>((resolve) => {
      setTimeout(resolve, ms);
    });

  /**
   * Runs `strategy` but stops waiting for it after `budgetMs`. The budget
   * timer is cleared as soon as the race is decided so no timer outlives the
   * call. The strategy itself is not cancelled, only abandoned.
   */
  const withBudget = async (
    strategy: () => Promise<void>,
    budgetMs: number,
  ): Promise<void> => {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const budget = new Promise<void>((resolve) => {
      timer = setTimeout(resolve, budgetMs);
    });
    try {
      await Promise.race([strategy(), budget]);
    } finally {
      clearTimeout(timer);
    }
  };

  // Strategy 1: wait for a loading indicator to appear and then disappear.
  // All indicators are probed with ONE selector list: probing them one by one
  // with a 2 s timeout each never got past the second entry within the budget.
  const waitForLoadingToFinish = async (): Promise<void> => {
    const indicators = [
      '.loading', '.loader', '.spinner', '.skeleton',
      '[class*="loading"]', '[class*="skeleton"]',
      '.placeholder', '.shimmer',
    ].join(', ');
    try {
      await page.waitForSelector(indicators, { timeout: 2000 });
      await page.waitForSelector(indicators, { hidden: true, timeout: 10000 });
    } catch {
      // No indicator showed up, or it never went away: carry on regardless.
    }
  };

  // Strategy 2: scroll in 500 px steps every 200 ms until the bottom (or
  // 5000 px) is reached, jump back to the top, then give the page one second.
  const scrollToTriggerLazyLoading = async (): Promise<void> => {
    await page.evaluate(() => {
      return new Promise<void>((resolve) => {
        let totalHeight = 0;
        const distance = 500;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;

          if (totalHeight >= scrollHeight || totalHeight > 5000) {
            clearInterval(timer);
            window.scrollTo(0, 0); // Scroll back to top
            setTimeout(() => resolve(), 1000);
          }
        }, 200);
      });
    });
  };

  // Strategy 3: wait until any typical article container exists. A single
  // selector list lets every candidate count for the whole budget instead of
  // only the first one or two.
  const waitForContentContainer = async (): Promise<void> => {
    const containers = [
      'article', '.article-content', '.post-content', '.entry-content',
      'main', '[role="main"]', '.main-content',
    ].join(', ');
    try {
      // Puppeteer treats `timeout: 0` as "no timeout", so never pass 0.
      await page.waitForSelector(containers, {
        timeout: Math.max(1, contentBudgetMs),
      });
    } catch {
      // None of the containers appeared in time: extraction has its own fallbacks.
    }
  };

  // Strategy 4: click the first "read more" / expand control that exists.
  // NOTE(review): some of these selectors match anchors; clicking one could
  // navigate away from the article — confirm this is acceptable.
  const expandCollapsedContent = async (): Promise<void> => {
    const expandSelectors = [
      'button[class*="read-more"]', 'button[class*="expand"]',
      '.read-more', '.show-more', '.expand-content',
      'a[class*="read-more"]', 'a[class*="continue"]',
    ];
    for (const selector of expandSelectors) {
      try {
        const button = await page.$(selector);
        if (button) {
          await button.click();
          await sleep(postClickDelayMs);
          break;
        }
      } catch {
        // Lookup or click failed for this selector: try the next one.
      }
    }
  };

  try {
    console.log('Starting dynamic content handling...');
    // Wait for initial content
    await page.waitForSelector('body', { timeout: 5000 });

    // allSettled: one failing strategy must not cancel the others.
    await Promise.allSettled([
      withBudget(waitForLoadingToFinish, loadingBudgetMs),
      withBudget(scrollToTriggerLazyLoading, scrollBudgetMs),
      withBudget(waitForContentContainer, contentBudgetMs),
      withBudget(expandCollapsedContent, expandBudgetMs),
    ]);

    // Final wait for any remaining dynamic content
    await sleep(settleDelayMs);
  } catch (error) {
    console.log('Dynamic content handling failed, using basic timeout:', error);
    // The fallback must not touch `page` again: it may be the thing that failed.
    await sleep(fallbackDelayMs);
  }
  console.log('Dynamic content handling completed.');
}
): Promise { + console.log(`Starting content extraction for: ${url}`); let page = null; try { const browser = await getBrowser(); @@ -52,155 +169,352 @@ export async function extractArticleContent( ); await page.setViewport({ width: 1280, height: 720 }); - // Set navigation timeout - page.setDefaultNavigationTimeout(30000); - page.setDefaultTimeout(30000); + // Set navigation timeout and disable images for faster loading + page.setDefaultNavigationTimeout(45000); + page.setDefaultTimeout(45000); + + // Block unnecessary resources to speed up loading + await page.setRequestInterception(true); + page.on('request', (req) => { + const resourceType = req.resourceType(); + if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') { + req.abort(); + } else { + req.continue(); + } + }); - // Navigate to the page + // Navigate to the page with better waiting strategy const response = await page.goto(url, { - waitUntil: "networkidle2", - timeout: 30000, + waitUntil: "domcontentloaded", + timeout: 45000, }); if (!response || !response.ok()) { throw new Error(`HTTP ${response?.status()}: Failed to load page`); } - // Wait for potential dynamic content - await new Promise((resolve) => setTimeout(resolve, 2000)); + // Enhanced dynamic content handling + console.log('Handling dynamic content...'); + await handleDynamicContent(page); - // Extract content using page.evaluate + // Extract content using advanced multi-strategy approach + console.log('Extracting content using multi-strategy approach...'); const extractedData = await page.evaluate(() => { - // Remove unwanted elements + interface ContentCandidate { + element: Element; + score: number; + content: string; + selector: string; + } + + // Remove unwanted elements first const unwantedSelectors = [ - "script", - "style", - "nav", - "header", - "footer", - "aside", - ".advertisement", - ".ads", - ".sidebar", - ".menu", - ".navigation", - ".social-share", - ".comments", - ".cookie-banner", - ".popup", 
- ".modal", + "script", "style", "noscript", "iframe", "embed", "object", + "nav", "header", "footer", "aside", "form", + ".advertisement", ".ads", ".ad", ".adsbygoogle", "[class*='ad-']", "[id*='ad-']", + ".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb", + ".social-share", ".share", ".social", ".sns", + ".comments", ".comment", ".disqus", + ".cookie-banner", ".cookie", ".gdpr", + ".popup", ".modal", ".overlay", ".lightbox", + ".related", ".recommended", ".more-stories", + ".tags", ".categories", ".metadata", + ".author-bio", ".author-info", + ".newsletter", ".subscribe", ".signup", + "[role='complementary']", "[role='banner']", "[role='contentinfo']", + "[aria-label*='advertisement']", "[aria-label*='sidebar']" ]; unwantedSelectors.forEach((selector) => { - const elements = document.querySelectorAll(selector); - elements.forEach((el) => el.remove()); + try { + const elements = document.querySelectorAll(selector); + elements.forEach((el) => el.remove()); + } catch (e) { + // Ignore invalid selectors + } }); - let content = ""; - let title = ""; - let description = ""; - // Extract title - const titleElement = document.querySelector("title"); - const h1Element = document.querySelector("h1"); - const ogTitleMeta = document.querySelector('meta[property="og:title"]'); - - title = - titleElement?.textContent?.trim() || - h1Element?.textContent?.trim() || - ogTitleMeta?.getAttribute("content") || - ""; - - // Extract description - const descriptionMeta = document.querySelector( - 'meta[name="description"]', - ); - const ogDescriptionMeta = document.querySelector( - 'meta[property="og:description"]', - ); - - description = - descriptionMeta?.getAttribute("content") || - ogDescriptionMeta?.getAttribute("content") || - ""; - - // Try multiple content extraction strategies - const contentSelectors = [ - // Common article selectors - "article", - '[role="main"]', - ".article-content", - ".post-content", - ".entry-content", - ".content", - ".main-content", - 
".article-body", - ".post-body", - ".story-body", - ".news-content", - - // Japanese news site specific selectors - ".article", - ".news-article", - ".post", - ".entry", - "#content", - "#main", - ".main", - - // Fallback to common containers - ".container", - "#container", - "main", - "body", + let title = ""; + const titleSources = [ + () => document.querySelector('meta[property="og:title"]')?.getAttribute('content'), + () => document.querySelector('meta[name="twitter:title"]')?.getAttribute('content'), + () => document.querySelector('h1')?.textContent?.trim(), + () => document.querySelector('.article-title, .post-title, .entry-title')?.textContent?.trim(), + () => document.querySelector('title')?.textContent?.trim(), + () => document.querySelector('[itemprop="headline"]')?.textContent?.trim() ]; - for (const selector of contentSelectors) { - const element = document.querySelector(selector); - if (element) { - // Get text content and clean it up - let extractedText = element.textContent?.trim() || ""; - - // Remove extra whitespace and normalize - extractedText = extractedText - .replace(/\s+/g, " ") - .replace(/\n\s*\n/g, "\n") - .trim(); - - // Only use if we found substantial content - if (extractedText.length > 200) { - content = extractedText; + for (const source of titleSources) { + try { + const result = source(); + if (result && result.length > 0) { + title = result; break; } + } catch (e) { + continue; } } - // If still no content, try paragraph extraction - if (!content) { - const paragraphs = Array.from(document.querySelectorAll("p")) - .map((p) => p.textContent?.trim() || "") - .filter((p) => p.length > 50); // Filter out short paragraphs - content = paragraphs.join("\n\n"); + // Extract description + let description = ""; + const descriptionSources = [ + () => document.querySelector('meta[property="og:description"]')?.getAttribute('content'), + () => document.querySelector('meta[name="description"]')?.getAttribute('content'), + () => 
document.querySelector('meta[name="twitter:description"]')?.getAttribute('content'), + () => document.querySelector('[itemprop="description"]')?.textContent?.trim() + ]; + + for (const source of descriptionSources) { + try { + const result = source(); + if (result && result.length > 0) { + description = result; + break; + } + } catch (e) { + continue; + } } - // Final fallback: use body text - if (!content || content.length < 100) { - const bodyText = document.body?.textContent || ""; - content = bodyText.replace(/\s+/g, " ").trim(); - } + // Comprehensive content selectors with priorities + const contentSelectors = [ + // Schema.org and structured data + '[itemtype*="Article"] [itemprop="articleBody"]', + '[itemtype*="NewsArticle"] [itemprop="articleBody"]', + '[itemtype*="BlogPosting"] [itemprop="articleBody"]', + + // High-priority semantic selectors + 'article[role="main"]', + 'main article', + '[role="main"] article', + 'article', + + // Common CMS and platform selectors + '.post-content', '.entry-content', '.article-content', '.content-area', + '.article-body', '.post-body', '.entry-body', '.story-body', + '.main-content', '.primary-content', '.page-content', + '.news-content', '.blog-content', '.editorial-content', + + // WordPress specific + '.wp-content', '.entry', '.post', + + // Medium, Substack, Ghost + '.section-content', '.postArticle-content', '.post-full-content', + '.markup', '.section--body', '.section-divider + .section-content', + + // Japanese sites specific + '.honbun', '.main_text', '.article_body', '.news_body', + '.entry_text', '.blog_text', '.content_text', + '.kiji', '.news', '.article', + + // Generic semantic HTML5 + 'main', '[role="main"]', + + // ID-based selectors + '#content', '#main', '#article', '#post', '#entry', + '#main-content', '#primary', '#content-area', + + // Class-based common patterns + '.content', '.main', '.wrapper', '.container', + + // Fallbacks + 'body' + ]; + // Function to calculate content quality score + 
function calculateContentScore(element: Element): number { + if (!element) return 0; + + const text = element.textContent || ''; + if (text.length < 100) return 0; + + let score = 0; + + // Base score from text length (diminishing returns) + score += Math.min(text.length / 100, 50); + + // Paragraph density + const paragraphs = element.querySelectorAll('p'); + const avgParagraphLength = paragraphs.length > 0 ? + Array.from(paragraphs).reduce((sum, p) => sum + (p.textContent?.length || 0), 0) / paragraphs.length : 0; + + if (avgParagraphLength > 100) score += 20; + if (paragraphs.length > 3) score += 10; + + // Link density penalty (articles shouldn't be mostly links) + const links = element.querySelectorAll('a'); + const linkText = Array.from(links).reduce((sum, link) => sum + (link.textContent?.length || 0), 0); + const linkDensity = text.length > 0 ? linkText / text.length : 0; + if (linkDensity < 0.2) score += 15; + else if (linkDensity < 0.4) score += 5; + else score -= 10; + + // Bonus for article-like structure + if (element.tagName === 'ARTICLE') score += 25; + if (element.getAttribute('role') === 'main') score += 20; + if (element.querySelector('h1, h2, h3')) score += 10; + + // Bonus for semantic elements + const semanticElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol'); + if (semanticElements.length > 5) score += 15; + + // Penalty for too many images without text + const images = element.querySelectorAll('img'); + if (images.length > text.length / 500) score -= 5; + + // Penalty for navigation-like content + const navWords = ['メニュー', 'ナビ', 'カテゴリ', 'タグ', 'menu', 'navigation', 'nav', 'sidebar']; + const className = element.className.toLowerCase(); + const id = element.id.toLowerCase(); + if (navWords.some(word => className.includes(word) || id.includes(word))) { + score -= 20; + } + + return Math.max(score, 0); + } + + // Function to clean and normalize text + function cleanText(text: string): string { + return text + 
.replace(/\s+/g, ' ') // Normalize whitespace + .replace(/\n\s*\n\s*\n/g, '\n\n') // Reduce excessive line breaks + .replace(/^\s+|\s+$/g, '') // Trim + .replace(/[\u200B-\u200D\uFEFF]/g, '') // Remove zero-width characters + .trim(); + } + + // Collect and score all content candidates + const candidates: ContentCandidate[] = []; + + for (const selector of contentSelectors) { + try { + const elements = document.querySelectorAll(selector); + elements.forEach((element, index) => { + const text = element.textContent || ''; + if (text.length > 200) { // Minimum content threshold + const score = calculateContentScore(element); + candidates.push({ + element, + score, + content: cleanText(text), + selector: `${selector}[${index}]` + }); + } + }); + } catch (e) { + // Skip invalid selectors + continue; + } + } + + // Sort candidates by score (highest first) + candidates.sort((a, b) => b.score - a.score); + + console.log(`Found ${candidates.length} content candidates`); + if (candidates.length > 0) { + console.log(`Best candidate score: ${candidates[0].score}, selector: ${candidates[0].selector}`); + } + + // Get the best content + let content = ""; + if (candidates.length > 0) { + content = candidates[0].content; + + // If the best candidate is still short, try combining top candidates + if (content.length < 500 && candidates.length > 1) { + const topCandidates = candidates.slice(0, 3).filter(c => c.score > 10); + const combinedContent = topCandidates.map(c => c.content).join('\n\n'); + if (combinedContent.length > content.length) { + content = cleanText(combinedContent); + } + } + } + + // Fallback strategies if still no good content + if (!content || content.length < 200) { + // Try paragraph aggregation + const paragraphs = Array.from(document.querySelectorAll('p')) + .map(p => p.textContent?.trim() || '') + .filter(p => p.length > 50) + .filter(p => { + // Filter out likely navigation/boilerplate paragraphs + const lowerP = p.toLowerCase(); + return 
!lowerP.includes('cookie') && + !lowerP.includes('privacy') && + !lowerP.includes('terms of service') && + !lowerP.includes('subscribe') && + !lowerP.includes('newsletter'); + }); + + if (paragraphs.length > 0) { + content = cleanText(paragraphs.join('\n\n')); + } + } + + // Final fallback: structured data + if (!content || content.length < 200) { + try { + const jsonLd = document.querySelector('script[type="application/ld+json"]'); + if (jsonLd) { + const data = JSON.parse(jsonLd.textContent || '{}'); + if (data.articleBody) { + content = cleanText(data.articleBody); + } else if (data.text) { + content = cleanText(data.text); + } + } + } catch (e) { + // Ignore JSON parsing errors + } + } + + console.log(`Final content length: ${content.length} characters`); return { title, content, description }; }); - // Validate extracted content - if (!extractedData.content || extractedData.content.length < 50) { - return { - title: extractedData.title, - content: "", - description: extractedData.description, - success: false, - error: "Insufficient content extracted", - }; + // Validate extracted content with more lenient threshold + if (!extractedData.content || extractedData.content.length < 100) { + // Try one more extraction attempt with relaxed criteria + const fallbackData = await page.evaluate(() => { + // Last resort: extract all text from body, excluding common noise + const body = document.body; + if (body) { + // Clone body to avoid modifying original + const bodyClone = body.cloneNode(true) as Element; + + // Remove noise elements from clone + const noiseSelectors = [ + 'script', 'style', 'nav', 'header', 'footer', 'aside', + '.ad', '.ads', '.advertisement', '[class*="ad-"]', + '.menu', '.navigation', '.sidebar', '.social', + '.cookie', '.popup', '.modal' + ]; + + noiseSelectors.forEach(selector => { + const elements = bodyClone.querySelectorAll(selector); + elements.forEach(el => el.remove()); + }); + + const text = bodyClone.textContent || ''; + return 
text.replace(/\s+/g, ' ').trim(); + } + return ''; + }); + + if (fallbackData && fallbackData.length > 200) { + extractedData.content = fallbackData; + } else { + return { + title: extractedData.title, + content: extractedData.content || "", + description: extractedData.description, + success: false, + error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`, + }; + } } // Limit content length to avoid token limits @@ -210,6 +524,7 @@ export async function extractArticleContent( content = content.substring(0, maxLength) + "..."; } + console.log(`Successfully extracted content: ${content.length} characters`); return { title: extractedData.title, content, @@ -217,6 +532,7 @@ export async function extractArticleContent( success: true, }; } catch (error) { + console.error(`Content extraction failed for ${url}:`, error); return { title: "", content: "",