Apply formatting

2025-06-08 15:21:58 +09:00
parent b5ff912fcb
commit a728ebb66c
28 changed files with 1809 additions and 1137 deletions
--- a/services/content-extractor.ts
+++ b/services/content-extractor.ts
@@ -1,4 +1,4 @@
-import * as cheerio from 'cheerio';
+import * as cheerio from "cheerio";

 export interface ExtractedContent {
  title?: string;
@@ -8,17 +8,21 @@ export interface ExtractedContent {
  error?: string;
 }

-export async function extractArticleContent(url: string): Promise<ExtractedContent> {
+export async function extractArticleContent(
+  url: string,
+): Promise<ExtractedContent> {
  try {
    // Fetch the HTML content
    const response = await fetch(url, {
      headers: {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3',
-        'Accept-Encoding': 'gzip, deflate',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1',
+        "User-Agent":
+          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        Accept:
+          "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
+        "Accept-Encoding": "gzip, deflate",
+        Connection: "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
      },
      signal: AbortSignal.timeout(30000), // 30 second timeout
    });
@@ -31,52 +35,56 @@ export async function extractArticleContent(url: string): Promise<ExtractedConte
    const $ = cheerio.load(html);

    // Remove unwanted elements
-    $('script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments').remove();
+    $(
+      "script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments",
+    ).remove();

-    let content = '';
-    let title = '';
-    let description = '';
+    let content = "";
+    let title = "";
+    let description = "";

    // Extract title
-    title = $('title').text().trim() || 
-            $('h1').first().text().trim() || 
-            $('meta[property="og:title"]').attr('content') || 
-            '';
+    title =
+      $("title").text().trim() ||
+      $("h1").first().text().trim() ||
+      $('meta[property="og:title"]').attr("content") ||
+      "";

    // Extract description
-    description = $('meta[name="description"]').attr('content') || 
-                  $('meta[property="og:description"]').attr('content') || 
-                  '';
+    description =
+      $('meta[name="description"]').attr("content") ||
+      $('meta[property="og:description"]').attr("content") ||
+      "";

    // Try multiple content extraction strategies
    const contentSelectors = [
      // Common article selectors
-      'article',
+      "article",
      '[role="main"]',
-      '.article-content',
-      '.post-content',
-      '.entry-content',
-      '.content',
-      '.main-content',
-      '.article-body',
-      '.post-body',
-      '.story-body',
-      '.news-content',
-      
+      ".article-content",
+      ".post-content",
+      ".entry-content",
+      ".content",
+      ".main-content",
+      ".article-body",
+      ".post-body",
+      ".story-body",
+      ".news-content",
+
      // Japanese news site specific selectors
-      '.article',
-      '.news-article',
-      '.post',
-      '.entry',
-      '#content',
-      '#main',
-      '.main',
-      
+      ".article",
+      ".news-article",
+      ".post",
+      ".entry",
+      "#content",
+      "#main",
+      ".main",
+
      // Fallback to common containers
-      '.container',
-      '#container',
-      'main',
-      'body'
+      ".container",
+      "#container",
+      "main",
+      "body",
    ];

    for (const selector of contentSelectors) {
@@ -84,11 +92,11 @@ export async function extractArticleContent(url: string): Promise<ExtractedConte
      if (element.length > 0) {
        // Get text content and clean it up
        let extractedText = element.text().trim();
-        
+
        // Remove extra whitespace and normalize
        extractedText = extractedText
-          .replace(/\s+/g, ' ')
-          .replace(/\n\s*\n/g, '\n')
+          .replace(/\s+/g, " ")
+          .replace(/\n\s*\n/g, "\n")
          .trim();

        // Only use if we found substantial content
@@ -101,50 +109,49 @@ export async function extractArticleContent(url: string): Promise<ExtractedConte

    // If still no content, try paragraph extraction
    if (!content) {
-      const paragraphs = $('p').map((_, el) => $(el).text().trim()).get();
+      const paragraphs = $("p")
+        .map((_, el) => $(el).text().trim())
+        .get();
      content = paragraphs
-        .filter(p => p.length > 50) // Filter out short paragraphs
-        .join('\n\n');
+        .filter((p) => p.length > 50) // Filter out short paragraphs
+        .join("\n\n");
    }

    // Final fallback: use body text
    if (!content || content.length < 100) {
-      content = $('body').text()
-        .replace(/\s+/g, ' ')
-        .trim();
+      content = $("body").text().replace(/\s+/g, " ").trim();
    }

    // Validate extracted content
    if (!content || content.length < 50) {
      return {
        title,
-        content: '',
+        content: "",
        description,
        success: false,
-        error: 'Insufficient content extracted'
+        error: "Insufficient content extracted",
      };
    }

    // Limit content length to avoid token limits
    const maxLength = 5000;
    if (content.length > maxLength) {
-      content = content.substring(0, maxLength) + '...';
+      content = content.substring(0, maxLength) + "...";
    }

    return {
      title,
      content,
      description,
-      success: true
+      success: true,
    };
-
  } catch (error) {
    return {
-      title: '',
-      content: '',
-      description: '',
+      title: "",
+      content: "",
+      description: "",
      success: false,
-      error: error instanceof Error ? error.message : 'Unknown error occurred'
+      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  }
 }
@@ -153,30 +160,30 @@ export async function enhanceArticleContent(
  originalTitle: string,
  originalLink: string,
  originalContent?: string,
-  originalDescription?: string
+  originalDescription?: string,
 ): Promise<{ content?: string; description?: string }> {
  // If we already have substantial content, use it
-  const existingContent = originalContent || originalDescription || '';
+  const existingContent = originalContent || originalDescription || "";
  if (existingContent.length > 500) {
    return {
      content: originalContent,
-      description: originalDescription
+      description: originalDescription,
    };
  }

  // Try to extract content from the URL
  const extracted = await extractArticleContent(originalLink);
-  
+
  if (extracted.success && extracted.content) {
    return {
      content: extracted.content,
-      description: extracted.description || originalDescription
+      description: extracted.description || originalDescription,
    };
  }

  // Return original content if extraction failed
  return {
    content: originalContent,
-    description: originalDescription
+    description: originalDescription,
  };
-}
+}