// VoiceRSSSummary/services/content-extractor.ts

import * as cheerio from "cheerio";

/**
 * Outcome of an attempt to pull readable article text out of a web page.
 *
 * `success` tells the caller whether `content` is usable; when it is `false`,
 * `error` carries a human-readable reason and `content` is an empty string.
 */
export interface ExtractedContent {
  /** `true` when enough text was extracted for `content` to be usable. */
  success: boolean;
  /** Extracted article text; empty when extraction failed. */
  content: string;
  /** Page title, when one could be determined. */
  title?: string;
  /** Page description taken from the document's meta tags, when present. */
  description?: string;
  /** Failure reason; only set when `success` is `false`. */
  error?: string;
}

/** Elements that are stripped from the document before any text is read. */
const UNWANTED_ELEMENTS_SELECTOR =
  "script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments";

/**
 * Selectors probed in priority order; the first one that yields more than
 * {@link MIN_SELECTOR_TEXT_LENGTH} characters of text wins.
 */
const CONTENT_SELECTORS: readonly string[] = [
  // Common article selectors
  "article",
  '[role="main"]',
  ".article-content",
  ".post-content",
  ".entry-content",
  ".content",
  ".main-content",
  ".article-body",
  ".post-body",
  ".story-body",
  ".news-content",

  // Japanese news site specific selectors
  ".article",
  ".news-article",
  ".post",
  ".entry",
  "#content",
  "#main",
  ".main",

  // Fallback to common containers
  ".container",
  "#container",
  "main",
  "body",
];

/** Request timeout in milliseconds. */
const FETCH_TIMEOUT_MS = 30000;
/** A selector match must produce more text than this to be accepted. */
const MIN_SELECTOR_TEXT_LENGTH = 200;
/** Upper bound on returned content length (UTF-16 code units, before the "..." suffix). */
const MAX_CONTENT_LENGTH = 5000;

/**
 * Throws unless `url` parses as an absolute `http:` or `https:` URL.
 * `new URL` itself throws on strings that are not absolute URLs.
 */
function assertHttpUrl(url: string): void {
  const { protocol } = new URL(url);
  if (protocol !== "http:" && protocol !== "https:") {
    throw new Error(`Unsupported URL protocol: ${protocol}`);
  }
}

/** Returns whether a `Content-Type` header value names a text or XML/HTML media type. */
function isTextualContentType(contentType: string): boolean {
  return /^\s*(?:text\/|application\/(?:[\w.-]+\+)?xml)/i.test(contentType);
}

/** Collapses every run of whitespace (including newlines) to one space and trims the ends. */
function collapseWhitespace(text: string): string {
  return text.replace(/\s+/g, " ").trim();
}

/**
 * Cuts `text` to at most `maxLength` UTF-16 code units and appends "..." when
 * something was cut, without leaving half of a surrogate pair at the end.
 */
function truncateContent(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  let end = maxLength;
  const lastUnit = text.charCodeAt(end - 1);
  // A high surrogate at the cut point would lose its low half; drop it too.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return text.substring(0, end) + "...";
}

/**
 * Walks {@link CONTENT_SELECTORS} and returns the normalized text of the first
 * selector whose matches hold substantial content, or "" when none does.
 */
function findMainContent($: ReturnType<typeof cheerio.load>): string {
  for (const selector of CONTENT_SELECTORS) {
    const matches = $(selector);
    if (matches.length === 0) {
      continue;
    }

    // `.text()` concatenates every matched element, so a match nested inside
    // another match would contribute its text twice. Keep only the outermost
    // matches.
    const outermost = matches.filter(
      (_, el) => $(el).parents(selector).length === 0,
    );

    const text = collapseWhitespace(outermost.text());
    if (text.length > MIN_SELECTOR_TEXT_LENGTH) {
      return text;
    }
  }
  return "";
}

/**
 * Downloads the page at `url` and extracts its title, meta description and
 * main article text.
 *
 * Never rejects: every failure (invalid or non-http(s) URL, network error,
 * timeout, non-2xx status, non-text response, too little text) is reported as
 * `{ success: false, error }` with empty `content`.
 *
 * @param url - Absolute `http:` or `https:` URL of the article.
 * @returns The extraction result; `content` is capped at 5000 characters plus
 *   a trailing "..." when truncated.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  try {
    assertHttpUrl(url);

    // Fetch the HTML content
    const response = await fetch(url, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        Accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        Connection: "keep-alive",
        "Upgrade-Insecure-Requests": "1",
      },
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
    });

    if (!response.ok) {
      // Release the unread body so the connection is not held open.
      await response.body?.cancel().catch(() => undefined);
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    // Refuse payloads that declare a non-text media type (PDF, audio, images):
    // decoding them as HTML would only yield garbage "content".
    const contentType = response.headers.get("content-type");
    if (contentType !== null && !isTextualContentType(contentType)) {
      await response.body?.cancel().catch(() => undefined);
      throw new Error(`Unsupported content type: ${contentType}`);
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Remove unwanted elements
    $(UNWANTED_ELEMENTS_SELECTOR).remove();

    // `.first()` on <title>: inline SVGs may carry their own <title> elements,
    // and `.text()` on the whole set would glue them all together.
    const title =
      $("title").first().text().trim() ||
      $("h1").first().text().trim() ||
      $('meta[property="og:title"]').attr("content")?.trim() ||
      "";

    const description =
      $('meta[name="description"]').attr("content")?.trim() ||
      $('meta[property="og:description"]').attr("content")?.trim() ||
      "";

    let content = findMainContent($);

    // If still no content, try paragraph extraction
    if (!content) {
      content = $("p")
        .map((_, el) => $(el).text().trim())
        .get()
        .filter((paragraph) => paragraph.length > 50) // Filter out short paragraphs
        .join("\n\n");
    }

    // Final fallback: use body text
    if (content.length < 100) {
      content = collapseWhitespace($("body").text());
    }

    // Validate extracted content
    if (content.length < 50) {
      return {
        title,
        content: "",
        description,
        success: false,
        error: "Insufficient content extracted",
      };
    }

    // Limit content length to avoid token limits
    return {
      title,
      content: truncateContent(content, MAX_CONTENT_LENGTH),
      description,
      success: true,
    };
  } catch (error) {
    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  }
}

/**
 * Supplies article text for a feed item, fetching it from the article page
 * when the feed itself provided too little.
 *
 * The feed's own text (content, or the description when content is empty) is
 * kept as-is when it is longer than 500 characters. Otherwise the page at
 * `originalLink` is scraped; a successful scrape replaces the content and,
 * when the page has a description, the description too. If scraping fails the
 * original values are returned unchanged.
 *
 * @param originalTitle - Title from the feed (currently unused).
 * @param originalLink - URL of the article page.
 * @param originalContent - Content supplied by the feed, if any.
 * @param originalDescription - Description supplied by the feed, if any.
 * @returns The content and description to use for the item.
 */
export async function enhanceArticleContent(
  originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  const unchanged = {
    content: originalContent,
    description: originalDescription,
  };

  // Feed text that is already substantial needs no network round-trip.
  const feedText = originalContent || originalDescription || "";
  if (feedText.length > 500) {
    return unchanged;
  }

  const { success, content, description } =
    await extractArticleContent(originalLink);

  // Extraction failed or produced nothing: keep what the feed gave us.
  if (!success || !content) {
    return unchanged;
  }

  return {
    content,
    description: description || originalDescription,
  };
}