// VoiceRSSSummary/services/content-extractor.ts

import puppeteer, { type Browser } from "puppeteer";

/**
 * Outcome of an attempt to pull readable article text out of a web page.
 *
 * Failures are reported through `success`/`error` rather than by throwing,
 * so callers should check `success` before using `content`.
 */
export interface ExtractedContent {
  /** Page title; may be an empty string when none was found or extraction failed. */
  title?: string;
  /** Extracted article text; an empty string whenever `success` is false. */
  content: string;
  /** Page description read from meta tags; may be an empty string. */
  description?: string;
  /** True when enough text was extracted to be considered usable. */
  success: boolean;
  /** Human-readable failure reason; only set when `success` is false. */
  error?: string;
}

// Singleton browser instance, reused across extractions so that each article
// does not pay the cost of starting a new Chromium process.
let sharedBrowser: Browser | null = null;

// Launch currently in progress, if any. Callers that arrive while a launch is
// still pending share this promise instead of each starting their own browser
// (which would leave all but the last one orphaned).
let pendingLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared browser, launching it first when there is none yet or
 * the previous one has disconnected.
 *
 * Concurrent calls made while a launch is in flight all resolve to the same
 * browser. If the launch fails, every waiting caller sees the rejection and
 * the next call starts a fresh launch.
 *
 * @returns The connected shared browser instance.
 */
async function getBrowser(): Promise<Browser> {
  const current = sharedBrowser;
  if (current !== null && current.isConnected()) {
    return current;
  }
  if (pendingLaunch !== null) {
    return pendingLaunch;
  }

  const launch = puppeteer.launch({
    headless: true,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--disable-accelerated-2d-canvas",
      "--no-first-run",
      "--no-zygote",
      "--disable-gpu",
      // NOTE(review): this turns off the same-origin policy for every page
      // loaded in this browser. Combined with --no-sandbox it widens the
      // attack surface when visiting untrusted sites; confirm it is needed.
      "--disable-web-security",
      "--disable-features=VizDisplayCompositor",
    ],
  });
  pendingLaunch = launch;

  try {
    const browser = await launch;
    sharedBrowser = browser;
    return browser;
  } finally {
    // Only clear the slot if it still refers to this launch attempt.
    if (pendingLaunch === launch) {
      pendingLaunch = null;
    }
  }
}

/**
 * Closes the shared browser if one is connected.
 *
 * The shared reference is cleared before closing, so a failed `close()` or an
 * already-disconnected browser never leaves a stale instance behind; the next
 * `getBrowser()` call simply launches a new one.
 */
export async function closeBrowser(): Promise<void> {
  const browser = sharedBrowser;
  sharedBrowser = null;
  if (browser !== null && browser.isConnected()) {
    await browser.close();
  }
}

/** Upper bound on returned content length, to stay clear of downstream token limits. */
const MAX_CONTENT_LENGTH = 5000;

/**
 * Throws unless `url` is an absolute http(s) URL.
 *
 * Article links come from external feeds, so anything else (file:, data:,
 * javascript:, ...) is rejected before a browser page is opened for it.
 */
function assertHttpUrl(url: string): void {
  let protocol: string;
  try {
    protocol = new URL(url).protocol;
  } catch {
    throw new Error(`Invalid URL: ${url}`);
  }
  if (protocol !== "http:" && protocol !== "https:") {
    throw new Error(`Unsupported URL protocol: ${protocol}`);
  }
}

/**
 * Shortens `text` to at most `maxLength` UTF-16 code units and appends "..."
 * when anything was cut off. Never splits a surrogate pair at the cut point.
 */
function truncateContent(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  let end = maxLength;
  const lastUnit = text.charCodeAt(end - 1);
  // A high surrogate at the cut would leave half a character behind.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return text.substring(0, end) + "...";
}

/**
 * Loads `url` in the shared headless browser and extracts the article's
 * title, description and main text.
 *
 * This function does not throw: every failure (invalid URL, navigation error,
 * non-2xx response, too little text) is reported as `success: false` with an
 * `error` message.
 *
 * @param url - Absolute http(s) URL of the article page.
 * @returns The extracted data; `content` is capped at 5000 characters plus a
 *   trailing "..." when truncated.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  let page = null;
  try {
    assertHttpUrl(url);

    const browser = await getBrowser();
    page = await browser.newPage();

    // Present as a regular desktop browser.
    await page.setUserAgent(
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    );
    await page.setViewport({ width: 1280, height: 720 });

    page.setDefaultNavigationTimeout(30000);
    page.setDefaultTimeout(30000);

    const response = await page.goto(url, {
      waitUntil: "networkidle2",
      timeout: 30000,
    });

    if (!response) {
      throw new Error("No response received: Failed to load page");
    }
    if (!response.ok()) {
      throw new Error(`HTTP ${response.status()}: Failed to load page`);
    }

    // Give client-side rendering a moment to finish.
    await new Promise((resolve) => setTimeout(resolve, 2000));

    // The callback runs inside the page, so it must be self-contained and
    // cannot reference anything from this module's scope.
    const extractedData = await page.evaluate(() => {
      // Strip page chrome and other noise before reading any text.
      const unwantedSelectors = [
        "script",
        "style",
        "nav",
        "header",
        "footer",
        "aside",
        ".advertisement",
        ".ads",
        ".sidebar",
        ".menu",
        ".navigation",
        ".social-share",
        ".comments",
        ".cookie-banner",
        ".popup",
        ".modal",
      ];
      for (const selector of unwantedSelectors) {
        document.querySelectorAll(selector).forEach((el) => el.remove());
      }

      // Title: <title>, then the first <h1>, then Open Graph.
      const title =
        document.querySelector("title")?.textContent?.trim() ||
        document.querySelector("h1")?.textContent?.trim() ||
        document
          .querySelector('meta[property="og:title"]')
          ?.getAttribute("content") ||
        "";

      // Description: standard meta tag, then Open Graph.
      const description =
        document
          .querySelector('meta[name="description"]')
          ?.getAttribute("content") ||
        document
          .querySelector('meta[property="og:description"]')
          ?.getAttribute("content") ||
        "";

      // Candidate containers, from most to least specific.
      const contentSelectors = [
        // Common article selectors
        "article",
        '[role="main"]',
        ".article-content",
        ".post-content",
        ".entry-content",
        ".content",
        ".main-content",
        ".article-body",
        ".post-body",
        ".story-body",
        ".news-content",

        // Japanese news site specific selectors
        ".article",
        ".news-article",
        ".post",
        ".entry",
        "#content",
        "#main",
        ".main",

        // Fallback to common containers
        ".container",
        "#container",
        "main",
        "body",
      ];

      let content = "";
      for (const selector of contentSelectors) {
        const element = document.querySelector(selector);
        if (!element) {
          continue;
        }
        // Collapse every whitespace run (newlines included) to one space.
        const text = (element.textContent ?? "").replace(/\s+/g, " ").trim();
        // Only accept a container that holds a substantial amount of text.
        if (text.length > 200) {
          content = text;
          break;
        }
      }

      // No container qualified: stitch together the longer paragraphs.
      if (!content) {
        content = Array.from(document.querySelectorAll("p"))
          .map((p) => p.textContent?.trim() || "")
          .filter((text) => text.length > 50)
          .join("\n\n");
      }

      // Last resort: whatever text the body still holds.
      if (!content || content.length < 100) {
        content = (document.body?.textContent || "").replace(/\s+/g, " ").trim();
      }

      return { title, content, description };
    });

    if (!extractedData.content || extractedData.content.length < 50) {
      return {
        title: extractedData.title,
        content: "",
        description: extractedData.description,
        success: false,
        error: "Insufficient content extracted",
      };
    }

    return {
      title: extractedData.title,
      content: truncateContent(extractedData.content, MAX_CONTENT_LENGTH),
      description: extractedData.description,
      success: true,
    };
  } catch (error) {
    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  } finally {
    if (page) {
      try {
        await page.close();
      } catch {
        // Deliberate best-effort cleanup: closing fails when the page or the
        // browser is already gone, and that must not replace the result (or
        // the error result) computed above with a rejection.
      }
    }
  }
}

/**
 * Supplements a feed item's text with content scraped from its link when the
 * feed itself did not provide enough.
 *
 * The first non-empty value of `originalContent` / `originalDescription` is
 * treated as the existing text. If it is longer than 500 characters, the
 * originals are returned untouched; otherwise the page at `originalLink` is
 * scraped, and the originals are returned only if scraping fails.
 *
 * @param _originalTitle - Unused; kept for call-site compatibility.
 * @param originalLink - URL of the article to scrape when needed.
 * @param originalContent - Content supplied by the feed, if any.
 * @param originalDescription - Description supplied by the feed, if any.
 * @returns The content and description to use for the article.
 */
export async function enhanceArticleContent(
  _originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  // What we hand back whenever scraping is skipped or does not succeed.
  const fallback = {
    content: originalContent,
    description: originalDescription,
  };

  // Feed text that is already substantial needs no scraping.
  const bestAvailable = originalContent || originalDescription || "";
  if (bestAvailable.length > 500) {
    return fallback;
  }

  const scraped = await extractArticleContent(originalLink);
  if (!scraped.success || !scraped.content) {
    return fallback;
  }

  return {
    content: scraped.content,
    description: scraped.description || originalDescription,
  };
}