VoiceRSSSummary/services/content-extractor.ts

import puppeteer, { type Browser } from "puppeteer";

/** Outcome of one attempt to extract an article from a web page. */
export interface ExtractedContent {
  /** Title found on the page, if any. */
  title?: string;
  /** Extracted article text; may be empty or very short when `success` is false. */
  content: string;
  /** Description found on the page, if any. */
  description?: string;
  /** True when enough text was extracted to be considered usable. */
  success: boolean;
  /** Human-readable failure reason, set when `success` is false. */
  error?: string;
}

// Singleton browser instance reused across extractions so that each call does
// not pay for a fresh browser launch. It is null until getBrowser() launches
// one, and closeBrowser() resets it to null after closing.
let sharedBrowser: Browser | null = null;

/**
 * Pauses the calling async function for about `ms` milliseconds.
 * Stand-in for `page.waitForTimeout`.
 */
async function waitForTimeout(ms: number): Promise<void> {
  await new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Best-effort wait for client-side rendered content to settle.
 *
 * Four independent strategies run in parallel, each with its own time budget:
 * waiting for loading indicators to clear, auto-scrolling to trigger lazy
 * loading, waiting for a main-content container, and clicking a "read more"
 * style expander. A strategy that fails or runs out of time never fails the
 * extraction; if the initial `body` wait fails, a plain 3 second pause is used
 * instead.
 *
 * @param page Puppeteer page that has already been navigated.
 */
// `any` is kept because the Puppeteer `Page` type is not imported in this file.
async function handleDynamicContent(page: any): Promise<void> {
  /** A strategy receives its time budget so its own waits never outlive it. */
  type Strategy = (budgetMs: number) => Promise<void>;

  // Strategy 1: wait for common loading indicators to disappear.
  const waitForLoadersToClear: Strategy = async (budgetMs) => {
    const loadingSelectors = [
      '.loading', '.loader', '.spinner', '.skeleton',
      '[class*="loading"]', '[class*="skeleton"]',
      '.placeholder', '.shimmer'
    ];
    const startedAt = Date.now();

    try {
      // One combined selector checks every indicator at once. Waiting for them
      // one after another would exhaust the budget on the first one or two.
      await page.waitForSelector(loadingSelectors.join(', '), {
        timeout: Math.min(2000, budgetMs),
      });
    } catch {
      return; // No loading indicator showed up.
    }

    // A timeout of 0 means "wait forever" in Puppeteer, so never pass it.
    const remainingMs = budgetMs - (Date.now() - startedAt);
    if (remainingMs <= 0) return;

    // With `hidden`, a selector that matches nothing resolves immediately, so
    // only indicators that are actually present cost any time.
    await Promise.allSettled(
      loadingSelectors.map((selector) =>
        page.waitForSelector(selector, { hidden: true, timeout: remainingMs }),
      ),
    );
  };

  // Strategy 2: auto-scroll (at most ~5000px) to trigger lazy loading.
  const autoScroll: Strategy = async () => {
    await page.evaluate(() => {
      return new Promise<void>((resolve) => {
        let totalHeight = 0;
        const distance = 500;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;

          if (totalHeight >= scrollHeight || totalHeight > 5000) {
            clearInterval(timer);
            window.scrollTo(0, 0); // Scroll back to top
            setTimeout(() => resolve(), 1000);
          }
        }, 200);
      });
    });
  };

  // Strategy 3: wait until any typical article container exists.
  const waitForMainContent: Strategy = async (budgetMs) => {
    const contentSelectors = [
      'article', '.article-content', '.post-content', '.entry-content',
      'main', '[role="main"]', '.main-content'
    ];

    try {
      await page.waitForSelector(contentSelectors.join(', '), { timeout: budgetMs });
    } catch {
      // None of the containers appeared in time; extraction has its own fallbacks.
    }
  };

  // Strategy 4: click the first "Read More" / expansion control found.
  // NOTE(review): the anchor selectors may match real links, in which case the
  // click navigates away from the current document.
  const expandCollapsedContent: Strategy = async () => {
    const expandButtons = [
      'button[class*="read-more"]', 'button[class*="expand"]',
      '.read-more', '.show-more', '.expand-content',
      'a[class*="read-more"]', 'a[class*="continue"]'
    ];

    for (const selector of expandButtons) {
      try {
        const button = await page.$(selector);
        if (button) {
          await button.click();
          await waitForTimeout(2000);
          break;
        }
      } catch {
        // Not clickable (detached, covered, ...): try the next selector.
      }
    }
  };

  // Races a strategy against its budget and always releases the budget timer,
  // so a strategy that finishes early leaves no pending timeout behind.
  const runWithBudget = async (strategy: Strategy, budgetMs: number): Promise<void> => {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const budgetElapsed = new Promise<void>((resolve) => {
      timer = setTimeout(resolve, budgetMs);
    });
    try {
      await Promise.race([strategy(budgetMs), budgetElapsed]);
    } finally {
      clearTimeout(timer);
    }
  };

  try {
    console.log('Starting dynamic content handling...');
    // Wait for initial content
    await page.waitForSelector('body', { timeout: 5000 });

    const outcomes = await Promise.allSettled([
      runWithBudget(waitForLoadersToClear, 3000),
      runWithBudget(autoScroll, 8000),
      runWithBudget(waitForMainContent, 5000),
      runWithBudget(expandCollapsedContent, 3000),
    ]);
    for (const outcome of outcomes) {
      if (outcome.status === 'rejected') {
        console.log('Dynamic content strategy failed:', outcome.reason);
      }
    }

    // Final wait for any remaining dynamic content
    await waitForTimeout(2000);
  } catch (error) {
    console.log('Dynamic content handling failed, using basic timeout:', error);
    // If dynamic content handling fails, continue with basic timeout
    await waitForTimeout(3000);
  }
  console.log('Dynamic content handling completed.');
}

// In-flight launch, shared so that concurrent callers during a cold start all
// wait for the same browser instead of each spawning their own.
let pendingBrowserLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared browser, launching it when there is none or when the
 * previous one has disconnected.
 *
 * Concurrent calls made while a launch is in progress receive that same
 * launch; a failed launch rejects for all of them and is retried on the next
 * call.
 *
 * @returns The connected shared browser instance.
 */
async function getBrowser(): Promise<Browser> {
  if (sharedBrowser?.isConnected()) {
    return sharedBrowser;
  }
  if (pendingBrowserLaunch) {
    return pendingBrowserLaunch;
  }

  const launch = puppeteer.launch({
    headless: true,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--disable-accelerated-2d-canvas",
      "--no-first-run",
      "--no-zygote",
      "--disable-gpu",
      // NOTE(review): this flag turns off the same-origin policy while the
      // browser loads arbitrary third-party pages; confirm it is still needed.
      "--disable-web-security",
      "--disable-features=VizDisplayCompositor",
    ],
  });
  pendingBrowserLaunch = launch;

  try {
    const browser = await launch;
    sharedBrowser = browser;
    return browser;
  } finally {
    // Cleared on success and on failure so a failed launch can be retried.
    pendingBrowserLaunch = null;
  }
}

/**
 * Closes the shared browser, if there is one.
 *
 * The shared reference is cleared before the close is awaited, so a
 * getBrowser() call made while the browser is shutting down launches a fresh
 * one instead of receiving the closing instance. This also prevents a failed
 * close() from leaving a stale reference behind.
 */
export async function closeBrowser(): Promise<void> {
  const browser = sharedBrowser;
  if (!browser) {
    return;
  }

  sharedBrowser = null;
  if (browser.isConnected()) {
    await browser.close();
  }
}

/** Title, body text and description scraped from the live document. */
interface ScrapedArticle {
  title: string;
  content: string;
  description: string;
}

/**
 * Finds the most article-like text in the current document.
 *
 * Runs INSIDE the browser: it is passed to `page.evaluate`, which serializes
 * its source text. It must therefore stay self-contained and may not reference
 * anything from this module's scope. It mutates the page by deleting
 * boilerplate elements (navigation, ads, comments, ...). `console.log` calls
 * in here write to the page's console, not to the Node process.
 */
function scrapeArticleInPage(): ScrapedArticle {
  interface ContentCandidate {
    element: Element;
    score: number;
    content: string;
    selector: string;
  }

  // Strips zero-width characters, collapses every whitespace run to a single
  // space and trims. Zero-width characters go first so that removing one
  // between two spaces cannot leave a double space behind.
  function cleanText(text: string): string {
    return text
      .replace(/[\u200B-\u200D\uFEFF]/g, "")
      .replace(/\s+/g, " ")
      .trim();
  }

  // Returns the first non-empty value produced by the given readers.
  function firstNonEmpty(readers: Array<() => string | null | undefined>): string {
    for (const read of readers) {
      try {
        const value = read();
        if (value) return value;
      } catch {
        // A failing reader is skipped; the next one is tried.
      }
    }
    return "";
  }

  // Looks for an article body in JSON-LD structured data. Handles several
  // <script> blocks, top-level arrays and "@graph" containers.
  function readJsonLdBody(): string {
    const scripts = Array.from(
      document.querySelectorAll('script[type="application/ld+json"]'),
    );
    for (const script of scripts) {
      try {
        const parsed: unknown = JSON.parse(script.textContent || "null");
        const queue: unknown[] = [parsed];
        while (queue.length > 0) {
          const node = queue.shift();
          if (Array.isArray(node)) {
            queue.push(...node);
            continue;
          }
          if (!node || typeof node !== "object") continue;

          const record = node as Record<string, unknown>;
          const articleBody = record["articleBody"];
          if (typeof articleBody === "string" && articleBody) return articleBody;
          const text = record["text"];
          if (typeof text === "string" && text) return text;

          const graph = record["@graph"];
          if (Array.isArray(graph)) queue.push(...graph);
        }
      } catch {
        // Malformed JSON-LD: ignore this block.
      }
    }
    return "";
  }

  // Heuristic quality score: higher means more article-like.
  function calculateContentScore(element: Element): number {
    if (!element) return 0;

    const text = element.textContent || "";
    if (text.length < 100) return 0;

    let score = 0;

    // Base score from text length (diminishing returns)
    score += Math.min(text.length / 100, 50);

    // Paragraph density
    const paragraphs = Array.from(element.querySelectorAll("p"));
    const avgParagraphLength =
      paragraphs.length > 0
        ? paragraphs.reduce((sum, p) => sum + (p.textContent?.length || 0), 0) / paragraphs.length
        : 0;
    if (avgParagraphLength > 100) score += 20;
    if (paragraphs.length > 3) score += 10;

    // Link density penalty (articles shouldn't be mostly links)
    const linkText = Array.from(element.querySelectorAll("a")).reduce(
      (sum, link) => sum + (link.textContent?.length || 0),
      0,
    );
    const linkDensity = text.length > 0 ? linkText / text.length : 0;
    if (linkDensity < 0.2) score += 15;
    else if (linkDensity < 0.4) score += 5;
    else score -= 10;

    // Bonus for article-like structure
    if (element.tagName === "ARTICLE") score += 25;
    if (element.getAttribute("role") === "main") score += 20;
    if (element.querySelector("h1, h2, h3")) score += 10;

    // Bonus for semantic elements
    const semanticElements = element.querySelectorAll(
      "p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol",
    );
    if (semanticElements.length > 5) score += 15;

    // Penalty for too many images without text
    const images = element.querySelectorAll("img");
    if (images.length > text.length / 500) score -= 5;

    // Penalty for navigation-like content. The class is read as an attribute
    // because `className` is not a string on SVG elements.
    const navWords = ["メニュー", "ナビ", "カテゴリ", "タグ", "menu", "navigation", "nav", "sidebar"];
    const className = (element.getAttribute("class") ?? "").toLowerCase();
    const id = element.id.toLowerCase();
    if (navWords.some((word) => className.includes(word) || id.includes(word))) {
      score -= 20;
    }

    return Math.max(score, 0);
  }

  // Structured data must be read BEFORE the cleanup below, which deletes every
  // <script> element, including the JSON-LD ones.
  const jsonLdBody = readJsonLdBody();

  // Remove unwanted elements first.
  // The "ad-" patterns are anchored to the start of a class token or to a
  // separator: a bare substring match would also delete "head-…", "thread-…",
  // "lead-…", "read-more" or "download-…".
  const unwantedSelectors = [
    "script", "style", "noscript", "iframe", "embed", "object",
    "nav", "header", "footer", "aside", "form",
    ".advertisement", ".ads", ".ad", ".adsbygoogle",
    "[class^='ad-']", "[class*=' ad-']", "[class*='-ad-']", "[class*='_ad-']",
    "[id^='ad-']", "[id*='-ad-']", "[id*='_ad-']",
    ".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb",
    ".social-share", ".share", ".social", ".sns",
    ".comments", ".comment", ".disqus",
    ".cookie-banner", ".cookie", ".gdpr",
    ".popup", ".modal", ".overlay", ".lightbox",
    ".related", ".recommended", ".more-stories",
    ".tags", ".categories", ".metadata",
    ".author-bio", ".author-info",
    ".newsletter", ".subscribe", ".signup",
    "[role='complementary']", "[role='banner']", "[role='contentinfo']",
    "[aria-label*='advertisement']", "[aria-label*='sidebar']",
  ];
  for (const selector of unwantedSelectors) {
    try {
      document.querySelectorAll(selector).forEach((el) => el.remove());
    } catch {
      // Ignore invalid selectors
    }
  }

  const title = firstNonEmpty([
    () => document.querySelector('meta[property="og:title"]')?.getAttribute("content"),
    () => document.querySelector('meta[name="twitter:title"]')?.getAttribute("content"),
    () => document.querySelector("h1")?.textContent?.trim(),
    () => document.querySelector(".article-title, .post-title, .entry-title")?.textContent?.trim(),
    () => document.querySelector("title")?.textContent?.trim(),
    () => document.querySelector('[itemprop="headline"]')?.textContent?.trim(),
  ]);

  const description = firstNonEmpty([
    () => document.querySelector('meta[property="og:description"]')?.getAttribute("content"),
    () => document.querySelector('meta[name="description"]')?.getAttribute("content"),
    () => document.querySelector('meta[name="twitter:description"]')?.getAttribute("content"),
    () => document.querySelector('[itemprop="description"]')?.textContent?.trim(),
  ]);

  // Content selectors, most specific first. Ties in score keep this order.
  const contentSelectors = [
    // Schema.org and structured data
    '[itemtype*="Article"] [itemprop="articleBody"]',
    '[itemtype*="NewsArticle"] [itemprop="articleBody"]',
    '[itemtype*="BlogPosting"] [itemprop="articleBody"]',

    // High-priority semantic selectors
    'article[role="main"]',
    "main article",
    '[role="main"] article',
    "article",

    // Common CMS and platform selectors
    ".post-content", ".entry-content", ".article-content", ".content-area",
    ".article-body", ".post-body", ".entry-body", ".story-body",
    ".main-content", ".primary-content", ".page-content",
    ".news-content", ".blog-content", ".editorial-content",

    // WordPress specific
    ".wp-content", ".entry", ".post",

    // Medium, Substack, Ghost
    ".section-content", ".postArticle-content", ".post-full-content",
    ".markup", ".section--body", ".section-divider + .section-content",

    // Japanese sites specific
    ".honbun", ".main_text", ".article_body", ".news_body",
    ".entry_text", ".blog_text", ".content_text",
    ".kiji", ".news", ".article",

    // Generic semantic HTML5
    "main", '[role="main"]',

    // ID-based selectors
    "#content", "#main", "#article", "#post", "#entry",
    "#main-content", "#primary", "#content-area",

    // Class-based common patterns
    ".content", ".main", ".wrapper", ".container",

    // Fallbacks
    "body",
  ];

  // Collect and score candidates. An element matched by several selectors
  // (e.g. "article" and "main article") is recorded once only.
  const candidates: ContentCandidate[] = [];
  const seenElements = new Set<Element>();
  for (const selector of contentSelectors) {
    try {
      document.querySelectorAll(selector).forEach((element, index) => {
        if (seenElements.has(element)) return;
        const text = element.textContent || "";
        if (text.length <= 200) return; // Minimum content threshold
        seenElements.add(element);
        candidates.push({
          element,
          score: calculateContentScore(element),
          content: cleanText(text),
          selector: `${selector}[${index}]`,
        });
      });
    } catch {
      // Skip invalid selectors
    }
  }

  // Highest score first
  candidates.sort((a, b) => b.score - a.score);

  const best = candidates[0];
  console.log(`Found ${candidates.length} content candidates`);
  if (best) {
    console.log(`Best candidate score: ${best.score}, selector: ${best.selector}`);
  }

  let content = best ? best.content : "";

  // If the best candidate is still short, combine up to three well-scored
  // candidates. Candidates nested inside one another are skipped so the same
  // text is never included twice.
  if (best && content.length < 500 && candidates.length > 1) {
    const picked: ContentCandidate[] = [];
    for (const candidate of candidates) {
      if (picked.length === 3) break;
      if (candidate.score <= 10) continue;
      const overlaps = picked.some(
        (chosen) =>
          chosen.element.contains(candidate.element) ||
          candidate.element.contains(chosen.element),
      );
      if (!overlaps) picked.push(candidate);
    }
    const combined = picked.map((candidate) => candidate.content).join(" ");
    if (combined.length > content.length) {
      content = combined;
    }
  }

  // Fallback: aggregate substantial paragraphs that do not look like boilerplate
  if (!content || content.length < 200) {
    const boilerplateWords = ["cookie", "privacy", "terms of service", "subscribe", "newsletter"];
    const paragraphs = Array.from(document.querySelectorAll("p"))
      .map((p) => p.textContent?.trim() || "")
      .filter((p) => p.length > 50)
      .filter((p) => {
        const lowerP = p.toLowerCase();
        return !boilerplateWords.some((word) => lowerP.includes(word));
      });

    if (paragraphs.length > 0) {
      content = cleanText(paragraphs.join("\n\n"));
    }
  }

  // Final fallback: structured data captured before the cleanup
  if ((!content || content.length < 200) && jsonLdBody) {
    content = cleanText(jsonLdBody);
  }

  console.log(`Final content length: ${content.length} characters`);
  return { title, content, description };
}

/**
 * Last-resort extraction: all text of `<body>` minus common noise elements.
 *
 * Runs INSIDE the browser via `page.evaluate` and must stay self-contained.
 * Works on a clone, so the live document is not modified.
 */
function scrapeBodyTextInPage(): string {
  const body = document.body;
  if (!body) return "";

  const bodyClone = body.cloneNode(true) as Element;

  // Same anchored "ad-" patterns as the main scraper, for the same reason.
  const noiseSelectors = [
    "script", "style", "nav", "header", "footer", "aside",
    ".ad", ".ads", ".advertisement",
    "[class^='ad-']", "[class*=' ad-']", "[class*='-ad-']", "[class*='_ad-']",
    ".menu", ".navigation", ".sidebar", ".social",
    ".cookie", ".popup", ".modal",
  ];
  for (const selector of noiseSelectors) {
    bodyClone.querySelectorAll(selector).forEach((el) => el.remove());
  }

  return (bodyClone.textContent || "").replace(/\s+/g, " ").trim();
}

/**
 * Loads `url` in the shared headless browser and extracts the article text.
 *
 * Never rejects: every failure (navigation error, non-2xx response, too little
 * text, browser trouble) is reported through `success: false` and `error`.
 * Extracted content is capped at 50,000 characters plus a trailing "...".
 *
 * @param url Address of the article page.
 * @returns The extracted title, content and description plus a success flag.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  console.log(`Starting content extraction for: ${url}`);
  let page = null;
  try {
    const browser = await getBrowser();
    page = await browser.newPage();

    // Set user agent and viewport
    await page.setUserAgent(
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    );
    await page.setViewport({ width: 1280, height: 720 });

    page.setDefaultNavigationTimeout(45000);
    page.setDefaultTimeout(45000);

    // Block images, media and fonts to speed up loading
    await page.setRequestInterception(true);
    page.on("request", (req) => {
      // Another handler may already have resolved this request.
      if (req.isInterceptResolutionHandled()) return;

      const resourceType = req.resourceType();
      const blocked =
        resourceType === "image" || resourceType === "media" || resourceType === "font";
      const settled = blocked ? req.abort() : req.continue();
      // abort()/continue() reject when the request is already gone (page
      // closed, navigation aborted). That is harmless here, but it must not
      // surface as an unhandled rejection.
      void settled.catch(() => undefined);
    });

    // Navigate to the page
    const response = await page.goto(url, {
      waitUntil: "domcontentloaded",
      timeout: 45000,
    });

    if (!response) {
      throw new Error("No response received: Failed to load page");
    }
    if (!response.ok()) {
      throw new Error(`HTTP ${response.status()}: Failed to load page`);
    }

    console.log("Handling dynamic content...");
    await handleDynamicContent(page);

    console.log("Extracting content using multi-strategy approach...");
    const extractedData = await page.evaluate(scrapeArticleInPage);

    // Validate extracted content with a lenient threshold
    if (!extractedData.content || extractedData.content.length < 100) {
      // One more attempt with relaxed criteria
      const fallbackData = await page.evaluate(scrapeBodyTextInPage);

      if (fallbackData && fallbackData.length > 200) {
        extractedData.content = fallbackData;
      } else {
        const shortContent = extractedData.content || "";
        return {
          title: extractedData.title,
          content: shortContent,
          description: extractedData.description,
          success: false,
          error: `Insufficient content extracted (${shortContent.length} characters)`,
        };
      }
    }

    // Limit content length to avoid token limits
    const maxLength = 50000;
    let content = extractedData.content;
    if (content.length > maxLength) {
      content = content.substring(0, maxLength) + "...";
    }

    console.log(`Successfully extracted content: ${content.length} characters`);
    return {
      title: extractedData.title,
      content,
      description: extractedData.description,
      success: true,
    };
  } catch (error) {
    console.error(`Content extraction failed for ${url}:`, error);
    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  } finally {
    if (page) {
      try {
        await page.close();
      } catch (closeError) {
        // A throw from `finally` would replace the result being returned, so a
        // failed close (e.g. the browser crashed) is only logged.
        console.warn(`Failed to close page for ${url}:`, closeError);
      }
    }
  }
}

/**
 * Supplies fuller article text for a feed item when the feed itself is thin.
 *
 * The feed's own content (or, when that is empty, its description) is kept
 * as-is if it is longer than 500 characters. Otherwise the article page is
 * fetched and, on success, its text replaces the content; the extracted
 * description is used when non-empty. If extraction fails, the original values
 * are returned unchanged.
 *
 * @param _originalTitle Unused; kept for interface compatibility.
 * @param originalLink URL of the article page.
 * @param originalContent Content supplied by the feed, if any.
 * @param originalDescription Description supplied by the feed, if any.
 * @returns The content and description to use for the item.
 */
export async function enhanceArticleContent(
  _originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  const unchanged = {
    content: originalContent,
    description: originalDescription,
  };

  // Substantial text already available: no need to hit the network.
  const bestExistingText = originalContent || originalDescription || "";
  if (bestExistingText.length > 500) {
    return unchanged;
  }

  const { success, content, description } = await extractArticleContent(originalLink);
  if (!success || !content) {
    return unchanged;
  }

  return {
    content,
    description: description || originalDescription,
  };
}