Update content extractor
This commit is contained in:
		@@ -11,6 +11,122 @@ export interface ExtractedContent {
 | 
			
		||||
// Singleton browser instance for reuse across extraction calls.
// Lazily (re)created by getBrowser() when null or disconnected, so the
// cost of launching a Chromium process is paid once, not per request.
let sharedBrowser: Browser | null = null;
// Dynamic content handling function
 | 
			
		||||
async function handleDynamicContent(page: any): Promise<void> {
 | 
			
		||||
  try {
 | 
			
		||||
    console.log('Starting dynamic content handling...');
 | 
			
		||||
    // Wait for initial content
 | 
			
		||||
    await page.waitForSelector('body', { timeout: 5000 });
 | 
			
		||||
    
 | 
			
		||||
    // Progressive loading strategy
 | 
			
		||||
    const loadingStrategies = [
 | 
			
		||||
      // Strategy 1: Wait for common loading indicators to disappear
 | 
			
		||||
      async () => {
 | 
			
		||||
        const loadingSelectors = [
 | 
			
		||||
          '.loading', '.loader', '.spinner', '.skeleton',
 | 
			
		||||
          '[class*="loading"]', '[class*="skeleton"]',
 | 
			
		||||
          '.placeholder', '.shimmer'
 | 
			
		||||
        ];
 | 
			
		||||
        
 | 
			
		||||
        for (const selector of loadingSelectors) {
 | 
			
		||||
          try {
 | 
			
		||||
            await page.waitForSelector(selector, { timeout: 2000 });
 | 
			
		||||
            await page.waitForSelector(selector, { hidden: true, timeout: 10000 });
 | 
			
		||||
            break;
 | 
			
		||||
          } catch (e) {
 | 
			
		||||
            // Continue to next selector
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      },
 | 
			
		||||
      
 | 
			
		||||
      // Strategy 2: Auto-scroll to trigger lazy loading
 | 
			
		||||
      async () => {
 | 
			
		||||
        await page.evaluate(() => {
 | 
			
		||||
          return new Promise<void>((resolve) => {
 | 
			
		||||
            let totalHeight = 0;
 | 
			
		||||
            const distance = 500;
 | 
			
		||||
            const timer = setInterval(() => {
 | 
			
		||||
              const scrollHeight = document.body.scrollHeight;
 | 
			
		||||
              window.scrollBy(0, distance);
 | 
			
		||||
              totalHeight += distance;
 | 
			
		||||
              
 | 
			
		||||
              if (totalHeight >= scrollHeight || totalHeight > 5000) {
 | 
			
		||||
                clearInterval(timer);
 | 
			
		||||
                window.scrollTo(0, 0); // Scroll back to top
 | 
			
		||||
                setTimeout(() => resolve(), 1000);
 | 
			
		||||
              }
 | 
			
		||||
            }, 200);
 | 
			
		||||
          });
 | 
			
		||||
        });
 | 
			
		||||
      },
 | 
			
		||||
      
 | 
			
		||||
      // Strategy 3: Wait for content-specific indicators
 | 
			
		||||
      async () => {
 | 
			
		||||
        const contentSelectors = [
 | 
			
		||||
          'article', '.article-content', '.post-content', '.entry-content',
 | 
			
		||||
          'main', '[role="main"]', '.main-content'
 | 
			
		||||
        ];
 | 
			
		||||
        
 | 
			
		||||
        for (const selector of contentSelectors) {
 | 
			
		||||
          try {
 | 
			
		||||
            await page.waitForSelector(selector, { timeout: 3000 });
 | 
			
		||||
            break;
 | 
			
		||||
          } catch (e) {
 | 
			
		||||
            // Continue to next selector
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      },
 | 
			
		||||
      
 | 
			
		||||
      // Strategy 4: Handle "Read More" or expansion buttons
 | 
			
		||||
      async () => {
 | 
			
		||||
        const expandButtons = [
 | 
			
		||||
          'button[class*="read-more"]', 'button[class*="expand"]',
 | 
			
		||||
          '.read-more', '.show-more', '.expand-content',
 | 
			
		||||
          'a[class*="read-more"]', 'a[class*="continue"]'
 | 
			
		||||
        ];
 | 
			
		||||
        
 | 
			
		||||
        for (const selector of expandButtons) {
 | 
			
		||||
          try {
 | 
			
		||||
            const button = await page.$(selector);
 | 
			
		||||
            if (button) {
 | 
			
		||||
              await button.click();
 | 
			
		||||
              await page.waitForTimeout(2000);
 | 
			
		||||
              break;
 | 
			
		||||
            }
 | 
			
		||||
          } catch (e) {
 | 
			
		||||
            // Continue to next button
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    ];
 | 
			
		||||
    
 | 
			
		||||
    // Execute strategies with timeouts
 | 
			
		||||
    const executeWithTimeout = async (strategy: () => Promise<void>, timeout: number) => {
 | 
			
		||||
      return Promise.race([
 | 
			
		||||
        strategy(),
 | 
			
		||||
        new Promise<void>((resolve) => setTimeout(resolve, timeout))
 | 
			
		||||
      ]);
 | 
			
		||||
    };
 | 
			
		||||
    
 | 
			
		||||
    // Execute all strategies in parallel with timeouts
 | 
			
		||||
    await Promise.allSettled([
 | 
			
		||||
      executeWithTimeout(loadingStrategies[0]!, 3000),
 | 
			
		||||
      executeWithTimeout(loadingStrategies[1]!, 8000),
 | 
			
		||||
      executeWithTimeout(loadingStrategies[2]!, 5000),
 | 
			
		||||
      executeWithTimeout(loadingStrategies[3]!, 3000)
 | 
			
		||||
    ]);
 | 
			
		||||
    
 | 
			
		||||
    // Final wait for any remaining dynamic content
 | 
			
		||||
    await page.waitForTimeout(2000);
 | 
			
		||||
    
 | 
			
		||||
  } catch (error) {
 | 
			
		||||
    console.log('Dynamic content handling failed, using basic timeout:', error);
 | 
			
		||||
    // If dynamic content handling fails, continue with basic timeout
 | 
			
		||||
    await page.waitForTimeout(3000);
 | 
			
		||||
  }
 | 
			
		||||
  console.log('Dynamic content handling completed.');
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
async function getBrowser(): Promise<Browser> {
 | 
			
		||||
  if (!sharedBrowser || !sharedBrowser.isConnected()) {
 | 
			
		||||
    sharedBrowser = await puppeteer.launch({
 | 
			
		||||
@@ -41,6 +157,7 @@ export async function closeBrowser(): Promise<void> {
 | 
			
		||||
export async function extractArticleContent(
 | 
			
		||||
  url: string,
 | 
			
		||||
): Promise<ExtractedContent> {
 | 
			
		||||
  console.log(`Starting content extraction for: ${url}`);
 | 
			
		||||
  let page = null;
 | 
			
		||||
  try {
 | 
			
		||||
    const browser = await getBrowser();
 | 
			
		||||
@@ -52,155 +169,352 @@ export async function extractArticleContent(
 | 
			
		||||
    );
 | 
			
		||||
    await page.setViewport({ width: 1280, height: 720 });
 | 
			
		||||
 | 
			
		||||
    // Set navigation timeout
 | 
			
		||||
    page.setDefaultNavigationTimeout(30000);
 | 
			
		||||
    page.setDefaultTimeout(30000);
 | 
			
		||||
    // Set navigation timeout and disable images for faster loading
 | 
			
		||||
    page.setDefaultNavigationTimeout(45000);
 | 
			
		||||
    page.setDefaultTimeout(45000);
 | 
			
		||||
    
 | 
			
		||||
    // Navigate to the page
 | 
			
		||||
    // Block unnecessary resources to speed up loading
 | 
			
		||||
    await page.setRequestInterception(true);
 | 
			
		||||
    page.on('request', (req) => {
 | 
			
		||||
      const resourceType = req.resourceType();
 | 
			
		||||
      if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
 | 
			
		||||
        req.abort();
 | 
			
		||||
      } else {
 | 
			
		||||
        req.continue();
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    // Navigate to the page with better waiting strategy
 | 
			
		||||
    const response = await page.goto(url, {
 | 
			
		||||
      waitUntil: "networkidle2",
 | 
			
		||||
      timeout: 30000,
 | 
			
		||||
      waitUntil: "domcontentloaded",
 | 
			
		||||
      timeout: 45000,
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    if (!response || !response.ok()) {
 | 
			
		||||
      throw new Error(`HTTP ${response?.status()}: Failed to load page`);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Wait for potential dynamic content
 | 
			
		||||
    await new Promise((resolve) => setTimeout(resolve, 2000));
 | 
			
		||||
    // Enhanced dynamic content handling
 | 
			
		||||
    console.log('Handling dynamic content...');
 | 
			
		||||
    await handleDynamicContent(page);
 | 
			
		||||
 | 
			
		||||
    // Extract content using page.evaluate
 | 
			
		||||
    // Extract content using advanced multi-strategy approach
 | 
			
		||||
    console.log('Extracting content using multi-strategy approach...');
 | 
			
		||||
    const extractedData = await page.evaluate(() => {
 | 
			
		||||
      // Remove unwanted elements
 | 
			
		||||
      interface ContentCandidate {
 | 
			
		||||
        element: Element;
 | 
			
		||||
        score: number;
 | 
			
		||||
        content: string;
 | 
			
		||||
        selector: string;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // Remove unwanted elements first
 | 
			
		||||
      const unwantedSelectors = [
 | 
			
		||||
        "script",
 | 
			
		||||
        "style",
 | 
			
		||||
        "nav",
 | 
			
		||||
        "header",
 | 
			
		||||
        "footer",
 | 
			
		||||
        "aside",
 | 
			
		||||
        ".advertisement",
 | 
			
		||||
        ".ads",
 | 
			
		||||
        ".sidebar",
 | 
			
		||||
        ".menu",
 | 
			
		||||
        ".navigation",
 | 
			
		||||
        ".social-share",
 | 
			
		||||
        ".comments",
 | 
			
		||||
        ".cookie-banner",
 | 
			
		||||
        ".popup",
 | 
			
		||||
        ".modal",
 | 
			
		||||
        "script", "style", "noscript", "iframe", "embed", "object",
 | 
			
		||||
        "nav", "header", "footer", "aside", "form",
 | 
			
		||||
        ".advertisement", ".ads", ".ad", ".adsbygoogle", "[class*='ad-']", "[id*='ad-']",
 | 
			
		||||
        ".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb",
 | 
			
		||||
        ".social-share", ".share", ".social", ".sns",
 | 
			
		||||
        ".comments", ".comment", ".disqus",
 | 
			
		||||
        ".cookie-banner", ".cookie", ".gdpr",
 | 
			
		||||
        ".popup", ".modal", ".overlay", ".lightbox",
 | 
			
		||||
        ".related", ".recommended", ".more-stories",
 | 
			
		||||
        ".tags", ".categories", ".metadata",
 | 
			
		||||
        ".author-bio", ".author-info",
 | 
			
		||||
        ".newsletter", ".subscribe", ".signup",
 | 
			
		||||
        "[role='complementary']", "[role='banner']", "[role='contentinfo']",
 | 
			
		||||
        "[aria-label*='advertisement']", "[aria-label*='sidebar']"
 | 
			
		||||
      ];
 | 
			
		||||
 | 
			
		||||
      unwantedSelectors.forEach((selector) => {
 | 
			
		||||
        const elements = document.querySelectorAll(selector);
 | 
			
		||||
        elements.forEach((el) => el.remove());
 | 
			
		||||
        try {
 | 
			
		||||
          const elements = document.querySelectorAll(selector);
 | 
			
		||||
          elements.forEach((el) => el.remove());
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
          // Ignore invalid selectors
 | 
			
		||||
        }
 | 
			
		||||
      });
 | 
			
		||||
 | 
			
		||||
      let content = "";
 | 
			
		||||
      let title = "";
 | 
			
		||||
      let description = "";
 | 
			
		||||
 | 
			
		||||
      // Extract title
 | 
			
		||||
      const titleElement = document.querySelector("title");
 | 
			
		||||
      const h1Element = document.querySelector("h1");
 | 
			
		||||
      const ogTitleMeta = document.querySelector('meta[property="og:title"]');
 | 
			
		||||
 | 
			
		||||
      title =
 | 
			
		||||
        titleElement?.textContent?.trim() ||
 | 
			
		||||
        h1Element?.textContent?.trim() ||
 | 
			
		||||
        ogTitleMeta?.getAttribute("content") ||
 | 
			
		||||
        "";
 | 
			
		||||
 | 
			
		||||
      // Extract description
 | 
			
		||||
      const descriptionMeta = document.querySelector(
 | 
			
		||||
        'meta[name="description"]',
 | 
			
		||||
      );
 | 
			
		||||
      const ogDescriptionMeta = document.querySelector(
 | 
			
		||||
        'meta[property="og:description"]',
 | 
			
		||||
      );
 | 
			
		||||
 | 
			
		||||
      description =
 | 
			
		||||
        descriptionMeta?.getAttribute("content") ||
 | 
			
		||||
        ogDescriptionMeta?.getAttribute("content") ||
 | 
			
		||||
        "";
 | 
			
		||||
 | 
			
		||||
      // Try multiple content extraction strategies
 | 
			
		||||
      const contentSelectors = [
 | 
			
		||||
        // Common article selectors
 | 
			
		||||
        "article",
 | 
			
		||||
        '[role="main"]',
 | 
			
		||||
        ".article-content",
 | 
			
		||||
        ".post-content",
 | 
			
		||||
        ".entry-content",
 | 
			
		||||
        ".content",
 | 
			
		||||
        ".main-content",
 | 
			
		||||
        ".article-body",
 | 
			
		||||
        ".post-body",
 | 
			
		||||
        ".story-body",
 | 
			
		||||
        ".news-content",
 | 
			
		||||
 | 
			
		||||
        // Japanese news site specific selectors
 | 
			
		||||
        ".article",
 | 
			
		||||
        ".news-article",
 | 
			
		||||
        ".post",
 | 
			
		||||
        ".entry",
 | 
			
		||||
        "#content",
 | 
			
		||||
        "#main",
 | 
			
		||||
        ".main",
 | 
			
		||||
 | 
			
		||||
        // Fallback to common containers
 | 
			
		||||
        ".container",
 | 
			
		||||
        "#container",
 | 
			
		||||
        "main",
 | 
			
		||||
        "body",
 | 
			
		||||
      let title = "";
 | 
			
		||||
      const titleSources = [
 | 
			
		||||
        () => document.querySelector('meta[property="og:title"]')?.getAttribute('content'),
 | 
			
		||||
        () => document.querySelector('meta[name="twitter:title"]')?.getAttribute('content'),
 | 
			
		||||
        () => document.querySelector('h1')?.textContent?.trim(),
 | 
			
		||||
        () => document.querySelector('.article-title, .post-title, .entry-title')?.textContent?.trim(),
 | 
			
		||||
        () => document.querySelector('title')?.textContent?.trim(),
 | 
			
		||||
        () => document.querySelector('[itemprop="headline"]')?.textContent?.trim()
 | 
			
		||||
      ];
 | 
			
		||||
 | 
			
		||||
      for (const selector of contentSelectors) {
 | 
			
		||||
        const element = document.querySelector(selector);
 | 
			
		||||
        if (element) {
 | 
			
		||||
          // Get text content and clean it up
 | 
			
		||||
          let extractedText = element.textContent?.trim() || "";
 | 
			
		||||
 | 
			
		||||
          // Remove extra whitespace and normalize
 | 
			
		||||
          extractedText = extractedText
 | 
			
		||||
            .replace(/\s+/g, " ")
 | 
			
		||||
            .replace(/\n\s*\n/g, "\n")
 | 
			
		||||
            .trim();
 | 
			
		||||
 | 
			
		||||
          // Only use if we found substantial content
 | 
			
		||||
          if (extractedText.length > 200) {
 | 
			
		||||
            content = extractedText;
 | 
			
		||||
      for (const source of titleSources) {
 | 
			
		||||
        try {
 | 
			
		||||
          const result = source();
 | 
			
		||||
          if (result && result.length > 0) {
 | 
			
		||||
            title = result;
 | 
			
		||||
            break;
 | 
			
		||||
          }
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
          continue;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // Extract description
 | 
			
		||||
      let description = "";
 | 
			
		||||
      const descriptionSources = [
 | 
			
		||||
        () => document.querySelector('meta[property="og:description"]')?.getAttribute('content'),
 | 
			
		||||
        () => document.querySelector('meta[name="description"]')?.getAttribute('content'),
 | 
			
		||||
        () => document.querySelector('meta[name="twitter:description"]')?.getAttribute('content'),
 | 
			
		||||
        () => document.querySelector('[itemprop="description"]')?.textContent?.trim()
 | 
			
		||||
      ];
 | 
			
		||||
 | 
			
		||||
      for (const source of descriptionSources) {
 | 
			
		||||
        try {
 | 
			
		||||
          const result = source();
 | 
			
		||||
          if (result && result.length > 0) {
 | 
			
		||||
            description = result;
 | 
			
		||||
            break;
 | 
			
		||||
          }
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
          continue;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // Comprehensive content selectors with priorities
 | 
			
		||||
      const contentSelectors = [
 | 
			
		||||
        // Schema.org and structured data
 | 
			
		||||
        '[itemtype*="Article"] [itemprop="articleBody"]',
 | 
			
		||||
        '[itemtype*="NewsArticle"] [itemprop="articleBody"]',
 | 
			
		||||
        '[itemtype*="BlogPosting"] [itemprop="articleBody"]',
 | 
			
		||||
        
 | 
			
		||||
        // High-priority semantic selectors
 | 
			
		||||
        'article[role="main"]',
 | 
			
		||||
        'main article',
 | 
			
		||||
        '[role="main"] article',
 | 
			
		||||
        'article',
 | 
			
		||||
        
 | 
			
		||||
        // Common CMS and platform selectors
 | 
			
		||||
        '.post-content', '.entry-content', '.article-content', '.content-area',
 | 
			
		||||
        '.article-body', '.post-body', '.entry-body', '.story-body',
 | 
			
		||||
        '.main-content', '.primary-content', '.page-content',
 | 
			
		||||
        '.news-content', '.blog-content', '.editorial-content',
 | 
			
		||||
        
 | 
			
		||||
        // WordPress specific
 | 
			
		||||
        '.wp-content', '.entry', '.post',
 | 
			
		||||
        
 | 
			
		||||
        // Medium, Substack, Ghost
 | 
			
		||||
        '.section-content', '.postArticle-content', '.post-full-content',
 | 
			
		||||
        '.markup', '.section--body', '.section-divider + .section-content',
 | 
			
		||||
        
 | 
			
		||||
        // Japanese sites specific
 | 
			
		||||
        '.honbun', '.main_text', '.article_body', '.news_body',
 | 
			
		||||
        '.entry_text', '.blog_text', '.content_text',
 | 
			
		||||
        '.kiji', '.news', '.article',
 | 
			
		||||
        
 | 
			
		||||
        // Generic semantic HTML5
 | 
			
		||||
        'main', '[role="main"]',
 | 
			
		||||
        
 | 
			
		||||
        // ID-based selectors
 | 
			
		||||
        '#content', '#main', '#article', '#post', '#entry',
 | 
			
		||||
        '#main-content', '#primary', '#content-area',
 | 
			
		||||
        
 | 
			
		||||
        // Class-based common patterns
 | 
			
		||||
        '.content', '.main', '.wrapper', '.container',
 | 
			
		||||
        
 | 
			
		||||
        // Fallbacks
 | 
			
		||||
        'body'
 | 
			
		||||
      ];
 | 
			
		||||
 | 
			
		||||
      // Function to calculate content quality score
 | 
			
		||||
      function calculateContentScore(element: Element): number {
 | 
			
		||||
        if (!element) return 0;
 | 
			
		||||
        
 | 
			
		||||
        const text = element.textContent || '';
 | 
			
		||||
        if (text.length < 100) return 0;
 | 
			
		||||
        
 | 
			
		||||
        let score = 0;
 | 
			
		||||
        
 | 
			
		||||
        // Base score from text length (diminishing returns)
 | 
			
		||||
        score += Math.min(text.length / 100, 50);
 | 
			
		||||
        
 | 
			
		||||
        // Paragraph density
 | 
			
		||||
        const paragraphs = element.querySelectorAll('p');
 | 
			
		||||
        const avgParagraphLength = paragraphs.length > 0 ? 
 | 
			
		||||
          Array.from(paragraphs).reduce((sum, p) => sum + (p.textContent?.length || 0), 0) / paragraphs.length : 0;
 | 
			
		||||
        
 | 
			
		||||
        if (avgParagraphLength > 100) score += 20;
 | 
			
		||||
        if (paragraphs.length > 3) score += 10;
 | 
			
		||||
        
 | 
			
		||||
        // Link density penalty (articles shouldn't be mostly links)
 | 
			
		||||
        const links = element.querySelectorAll('a');
 | 
			
		||||
        const linkText = Array.from(links).reduce((sum, link) => sum + (link.textContent?.length || 0), 0);
 | 
			
		||||
        const linkDensity = text.length > 0 ? linkText / text.length : 0;
 | 
			
		||||
        if (linkDensity < 0.2) score += 15;
 | 
			
		||||
        else if (linkDensity < 0.4) score += 5;
 | 
			
		||||
        else score -= 10;
 | 
			
		||||
        
 | 
			
		||||
        // Bonus for article-like structure
 | 
			
		||||
        if (element.tagName === 'ARTICLE') score += 25;
 | 
			
		||||
        if (element.getAttribute('role') === 'main') score += 20;
 | 
			
		||||
        if (element.querySelector('h1, h2, h3')) score += 10;
 | 
			
		||||
        
 | 
			
		||||
        // Bonus for semantic elements
 | 
			
		||||
        const semanticElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol');
 | 
			
		||||
        if (semanticElements.length > 5) score += 15;
 | 
			
		||||
        
 | 
			
		||||
        // Penalty for too many images without text
 | 
			
		||||
        const images = element.querySelectorAll('img');
 | 
			
		||||
        if (images.length > text.length / 500) score -= 5;
 | 
			
		||||
        
 | 
			
		||||
        // Penalty for navigation-like content
 | 
			
		||||
        const navWords = ['メニュー', 'ナビ', 'カテゴリ', 'タグ', 'menu', 'navigation', 'nav', 'sidebar'];
 | 
			
		||||
        const className = element.className.toLowerCase();
 | 
			
		||||
        const id = element.id.toLowerCase();
 | 
			
		||||
        if (navWords.some(word => className.includes(word) || id.includes(word))) {
 | 
			
		||||
          score -= 20;
 | 
			
		||||
        }
 | 
			
		||||
        
 | 
			
		||||
        return Math.max(score, 0);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Function to clean and normalize text
 | 
			
		||||
      function cleanText(text: string): string {
 | 
			
		||||
        return text
 | 
			
		||||
          .replace(/\s+/g, ' ')  // Normalize whitespace
 | 
			
		||||
          .replace(/\n\s*\n\s*\n/g, '\n\n')  // Reduce excessive line breaks
 | 
			
		||||
          .replace(/^\s+|\s+$/g, '')  // Trim
 | 
			
		||||
          .replace(/[\u200B-\u200D\uFEFF]/g, '')  // Remove zero-width characters
 | 
			
		||||
          .trim();
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Collect and score all content candidates
 | 
			
		||||
      const candidates: ContentCandidate[] = [];
 | 
			
		||||
      
 | 
			
		||||
      for (const selector of contentSelectors) {
 | 
			
		||||
        try {
 | 
			
		||||
          const elements = document.querySelectorAll(selector);
 | 
			
		||||
          elements.forEach((element, index) => {
 | 
			
		||||
            const text = element.textContent || '';
 | 
			
		||||
            if (text.length > 200) {  // Minimum content threshold
 | 
			
		||||
              const score = calculateContentScore(element);
 | 
			
		||||
              candidates.push({
 | 
			
		||||
                element,
 | 
			
		||||
                score,
 | 
			
		||||
                content: cleanText(text),
 | 
			
		||||
                selector: `${selector}[${index}]`
 | 
			
		||||
              });
 | 
			
		||||
            }
 | 
			
		||||
          });
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
          // Skip invalid selectors
 | 
			
		||||
          continue;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Sort candidates by score (highest first)
 | 
			
		||||
      candidates.sort((a, b) => b.score - a.score);
 | 
			
		||||
      
 | 
			
		||||
      console.log(`Found ${candidates.length} content candidates`);
 | 
			
		||||
      if (candidates.length > 0) {
 | 
			
		||||
        console.log(`Best candidate score: ${candidates[0].score}, selector: ${candidates[0].selector}`);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Get the best content
 | 
			
		||||
      let content = "";
 | 
			
		||||
      if (candidates.length > 0) {
 | 
			
		||||
        content = candidates[0].content;
 | 
			
		||||
        
 | 
			
		||||
        // If the best candidate is still short, try combining top candidates
 | 
			
		||||
        if (content.length < 500 && candidates.length > 1) {
 | 
			
		||||
          const topCandidates = candidates.slice(0, 3).filter(c => c.score > 10);
 | 
			
		||||
          const combinedContent = topCandidates.map(c => c.content).join('\n\n');
 | 
			
		||||
          if (combinedContent.length > content.length) {
 | 
			
		||||
            content = cleanText(combinedContent);
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // If still no content, try paragraph extraction
 | 
			
		||||
      if (!content) {
 | 
			
		||||
        const paragraphs = Array.from(document.querySelectorAll("p"))
 | 
			
		||||
          .map((p) => p.textContent?.trim() || "")
 | 
			
		||||
          .filter((p) => p.length > 50); // Filter out short paragraphs
 | 
			
		||||
        content = paragraphs.join("\n\n");
 | 
			
		||||
      // Fallback strategies if still no good content
 | 
			
		||||
      if (!content || content.length < 200) {
 | 
			
		||||
        // Try paragraph aggregation
 | 
			
		||||
        const paragraphs = Array.from(document.querySelectorAll('p'))
 | 
			
		||||
          .map(p => p.textContent?.trim() || '')
 | 
			
		||||
          .filter(p => p.length > 50)
 | 
			
		||||
          .filter(p => {
 | 
			
		||||
            // Filter out likely navigation/boilerplate paragraphs
 | 
			
		||||
            const lowerP = p.toLowerCase();
 | 
			
		||||
            return !lowerP.includes('cookie') && 
 | 
			
		||||
                   !lowerP.includes('privacy') && 
 | 
			
		||||
                   !lowerP.includes('terms of service') &&
 | 
			
		||||
                   !lowerP.includes('subscribe') &&
 | 
			
		||||
                   !lowerP.includes('newsletter');
 | 
			
		||||
          });
 | 
			
		||||
          
 | 
			
		||||
        if (paragraphs.length > 0) {
 | 
			
		||||
          content = cleanText(paragraphs.join('\n\n'));
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Final fallback: use body text
 | 
			
		||||
      if (!content || content.length < 100) {
 | 
			
		||||
        const bodyText = document.body?.textContent || "";
 | 
			
		||||
        content = bodyText.replace(/\s+/g, " ").trim();
 | 
			
		||||
      // Final fallback: structured data
 | 
			
		||||
      if (!content || content.length < 200) {
 | 
			
		||||
        try {
 | 
			
		||||
          const jsonLd = document.querySelector('script[type="application/ld+json"]');
 | 
			
		||||
          if (jsonLd) {
 | 
			
		||||
            const data = JSON.parse(jsonLd.textContent || '{}');
 | 
			
		||||
            if (data.articleBody) {
 | 
			
		||||
              content = cleanText(data.articleBody);
 | 
			
		||||
            } else if (data.text) {
 | 
			
		||||
              content = cleanText(data.text);
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
          // Ignore JSON parsing errors
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      console.log(`Final content length: ${content.length} characters`);
 | 
			
		||||
      return { title, content, description };
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    // Validate extracted content
 | 
			
		||||
    if (!extractedData.content || extractedData.content.length < 50) {
 | 
			
		||||
      return {
 | 
			
		||||
        title: extractedData.title,
 | 
			
		||||
        content: "",
 | 
			
		||||
        description: extractedData.description,
 | 
			
		||||
        success: false,
 | 
			
		||||
        error: "Insufficient content extracted",
 | 
			
		||||
      };
 | 
			
		||||
    // Validate extracted content with more lenient threshold
 | 
			
		||||
    if (!extractedData.content || extractedData.content.length < 100) {
 | 
			
		||||
      // Try one more extraction attempt with relaxed criteria
 | 
			
		||||
      const fallbackData = await page.evaluate(() => {
 | 
			
		||||
        // Last resort: extract all text from body, excluding common noise
 | 
			
		||||
        const body = document.body;
 | 
			
		||||
        if (body) {
 | 
			
		||||
          // Clone body to avoid modifying original
 | 
			
		||||
          const bodyClone = body.cloneNode(true) as Element;
 | 
			
		||||
          
 | 
			
		||||
          // Remove noise elements from clone
 | 
			
		||||
          const noiseSelectors = [
 | 
			
		||||
            'script', 'style', 'nav', 'header', 'footer', 'aside',
 | 
			
		||||
            '.ad', '.ads', '.advertisement', '[class*="ad-"]',
 | 
			
		||||
            '.menu', '.navigation', '.sidebar', '.social',
 | 
			
		||||
            '.cookie', '.popup', '.modal'
 | 
			
		||||
          ];
 | 
			
		||||
          
 | 
			
		||||
          noiseSelectors.forEach(selector => {
 | 
			
		||||
            const elements = bodyClone.querySelectorAll(selector);
 | 
			
		||||
            elements.forEach(el => el.remove());
 | 
			
		||||
          });
 | 
			
		||||
          
 | 
			
		||||
          const text = bodyClone.textContent || '';
 | 
			
		||||
          return text.replace(/\s+/g, ' ').trim();
 | 
			
		||||
        }
 | 
			
		||||
        return '';
 | 
			
		||||
      });
 | 
			
		||||
      
 | 
			
		||||
      if (fallbackData && fallbackData.length > 200) {
 | 
			
		||||
        extractedData.content = fallbackData;
 | 
			
		||||
      } else {
 | 
			
		||||
        return {
 | 
			
		||||
          title: extractedData.title,
 | 
			
		||||
          content: extractedData.content || "",
 | 
			
		||||
          description: extractedData.description,
 | 
			
		||||
          success: false,
 | 
			
		||||
          error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
 | 
			
		||||
        };
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Limit content length to avoid token limits
 | 
			
		||||
@@ -210,6 +524,7 @@ export async function extractArticleContent(
 | 
			
		||||
      content = content.substring(0, maxLength) + "...";
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    console.log(`Successfully extracted content: ${content.length} characters`);
 | 
			
		||||
    return {
 | 
			
		||||
      title: extractedData.title,
 | 
			
		||||
      content,
 | 
			
		||||
@@ -217,6 +532,7 @@ export async function extractArticleContent(
 | 
			
		||||
      success: true,
 | 
			
		||||
    };
 | 
			
		||||
  } catch (error) {
 | 
			
		||||
    console.error(`Content extraction failed for ${url}:`, error);
 | 
			
		||||
    return {
 | 
			
		||||
      title: "",
 | 
			
		||||
      content: "",
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user