Update

2025-06-08 15:21:27 +09:00
parent ca236067e8
commit b5ff912fcb
3 changed files with 217 additions and 5 deletions
--- a/services/content-extractor.ts
+++ b/services/content-extractor.ts
@@ -0,0 +1,182 @@
+import * as cheerio from 'cheerio';
+
+/**
+ * Outcome of trying to pull readable article text out of a web page.
+ *
+ * Produced by `extractArticleContent`, which never throws: failures are
+ * reported through `success` and `error` instead.
+ */
+export interface ExtractedContent {
+  /**
+   * Page title taken from `<title>`, the first `<h1>`, or the `og:title`
+   * meta tag, in that order. Empty string when none is found or the
+   * request itself failed.
+   */
+  title?: string;
+  /**
+   * Whitespace-normalized article text, truncated to at most 5000
+   * characters followed by `...`. Always an empty string when `success`
+   * is `false`.
+   */
+  content: string;
+  /**
+   * Value of the `description` meta tag, falling back to
+   * `og:description`. Empty string when neither is present.
+   */
+  description?: string;
+  /** `true` only when enough text was extracted to be considered usable. */
+  success: boolean;
+  /** Human-readable failure reason; set only when `success` is `false`. */
+  error?: string;
+}
+
+/** Upper bound on returned content length, to stay within downstream token limits. */
+const MAX_CONTENT_LENGTH = 5000;
+
+/** A selector's text is accepted only if it is longer than this many characters. */
+const MIN_SELECTOR_TEXT_LENGTH = 200;
+
+/** Paragraphs at or below this length are ignored by the `<p>` fallback. */
+const MIN_PARAGRAPH_LENGTH = 50;
+
+/** Below this length the paragraph fallback is discarded in favour of raw body text. */
+const MIN_FALLBACK_LENGTH = 100;
+
+/** Anything shorter than this is reported as a failed extraction. */
+const MIN_VALID_LENGTH = 50;
+
+/** How many leading bytes are scanned for a `<meta>` charset declaration. */
+const CHARSET_SNIFF_BYTES = 4096;
+
+/** Elements that never contain article text and are dropped before extraction. */
+const NOISE_SELECTOR =
+  'script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments';
+
+/** Candidate article containers, tried in order from most to least specific. */
+const CONTENT_SELECTORS: readonly string[] = [
+  // Common article selectors
+  'article',
+  '[role="main"]',
+  '.article-content',
+  '.post-content',
+  '.entry-content',
+  '.content',
+  '.main-content',
+  '.article-body',
+  '.post-body',
+  '.story-body',
+  '.news-content',
+
+  // Japanese news site specific selectors
+  '.article',
+  '.news-article',
+  '.post',
+  '.entry',
+  '#content',
+  '#main',
+  '.main',
+
+  // Fallback to common containers
+  '.container',
+  '#container',
+  'main',
+  'body',
+];
+
+/** Collapses every run of whitespace (including newlines) into a single space. */
+function collapseWhitespace(text: string): string {
+  return text.replace(/\s+/g, ' ').trim();
+}
+
+/**
+ * Decodes an HTML payload using the charset declared in the `Content-Type`
+ * header or, failing that, in an early `<meta>` tag. Falls back to UTF-8 when
+ * no charset is declared or the declared label is not supported.
+ */
+function decodeHtml(bytes: Uint8Array, contentType: string | null): string {
+  let charset = /charset\s*=\s*["']?([^\s;"']+)/i.exec(contentType ?? '')?.[1];
+
+  if (!charset) {
+    // latin1 maps every byte to a character, so the ASCII markup of the
+    // <meta> declaration is readable regardless of the real encoding.
+    const head = new TextDecoder('latin1').decode(bytes.subarray(0, CHARSET_SNIFF_BYTES));
+    charset = /<meta[^>]*charset\s*=\s*["']?\s*([^\s"'>\/;]+)/i.exec(head)?.[1];
+  }
+
+  if (charset) {
+    try {
+      return new TextDecoder(charset).decode(bytes);
+    } catch {
+      // Unknown or unsupported label: fall through to UTF-8.
+    }
+  }
+  return new TextDecoder('utf-8').decode(bytes);
+}
+
+/**
+ * Cuts `text` to at most `maxLength` UTF-16 code units and appends `...`.
+ * Never leaves a dangling high surrogate at the cut point.
+ */
+function truncateContent(text: string, maxLength: number): string {
+  if (text.length <= maxLength) {
+    return text;
+  }
+  let end = maxLength;
+  const lastUnit = text.charCodeAt(end - 1);
+  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
+    end -= 1; // The pair's second half would be cut off; drop the first half too.
+  }
+  return text.substring(0, end) + '...';
+}
+
+/**
+ * Downloads `url` and extracts its title, meta description and main article
+ * text.
+ *
+ * Strategy: strip navigation/ads/scripts, then try a prioritized list of
+ * article container selectors, then long `<p>` elements, then the whole
+ * `<body>`. The result is capped at 5000 characters plus `...`.
+ *
+ * @param url - Absolute URL of the page to fetch.
+ * @returns The extraction result. This function never rejects: network
+ *   errors, timeouts (30 s), non-2xx responses and parse failures are all
+ *   reported as `success: false` with the reason in `error`.
+ */
+export async function extractArticleContent(url: string): Promise<ExtractedContent> {
+  try {
+    // Fetch the HTML content
+    const response = await fetch(url, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+      },
+      signal: AbortSignal.timeout(30000), // 30 second timeout
+    });
+
+    if (!response.ok) {
+      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+    }
+
+    // Decode manually instead of response.text(), which always assumes
+    // UTF-8 and garbles Shift_JIS / EUC-JP pages.
+    const bytes = new Uint8Array(await response.arrayBuffer());
+    const html = decodeHtml(bytes, response.headers.get('content-type'));
+    const $ = cheerio.load(html);
+
+    // Remove unwanted elements
+    $(NOISE_SELECTOR).remove();
+
+    // .first() matters: inline SVGs may carry their own <title> elements,
+    // and .text() on the full match set would concatenate all of them.
+    const title =
+      $('title').first().text().trim() ||
+      $('h1').first().text().trim() ||
+      $('meta[property="og:title"]').attr('content')?.trim() ||
+      '';
+
+    const description =
+      $('meta[name="description"]').attr('content') ||
+      $('meta[property="og:description"]').attr('content') ||
+      '';
+
+    let content = '';
+
+    // Try each candidate container until one yields substantial text
+    for (const selector of CONTENT_SELECTORS) {
+      // Keep only outermost matches: .text() already includes descendants,
+      // so a nested match would contribute its text twice.
+      const outermost = $(selector).filter((_, el) => $(el).parents(selector).length === 0);
+      if (outermost.length === 0) {
+        continue;
+      }
+
+      const candidate = collapseWhitespace(outermost.text());
+      if (candidate.length > MIN_SELECTOR_TEXT_LENGTH) {
+        content = candidate;
+        break;
+      }
+    }
+
+    // If still no content, try paragraph extraction
+    if (!content) {
+      content = $('p')
+        .map((_, el) => $(el).text().trim())
+        .get()
+        .filter((paragraph) => paragraph.length > MIN_PARAGRAPH_LENGTH)
+        .join('\n\n');
+    }
+
+    // Final fallback: use body text
+    if (!content || content.length < MIN_FALLBACK_LENGTH) {
+      content = collapseWhitespace($('body').text());
+    }
+
+    // Validate extracted content
+    if (!content || content.length < MIN_VALID_LENGTH) {
+      return {
+        title,
+        content: '',
+        description,
+        success: false,
+        error: 'Insufficient content extracted'
+      };
+    }
+
+    return {
+      title,
+      content: truncateContent(content, MAX_CONTENT_LENGTH),
+      description,
+      success: true
+    };
+
+  } catch (error) {
+    // Deliberately best-effort: callers get a failure result, not a rejection.
+    return {
+      title: '',
+      content: '',
+      description: '',
+      success: false,
+      error: error instanceof Error ? error.message : 'Unknown error occurred'
+    };
+  }
+}
+
+/**
+ * Returns the best available content/description pair for an article,
+ * fetching the article page only when the supplied text is too thin.
+ *
+ * The first non-empty value of `originalContent` / `originalDescription` is
+ * measured; if it is longer than 500 characters both inputs are returned
+ * unchanged. Otherwise the page at `originalLink` is scraped, and the inputs
+ * are still returned unchanged if scraping fails or yields no text.
+ *
+ * @param originalTitle - Article title; accepted for signature compatibility
+ *   and not read by this function.
+ * @param originalLink - URL passed to `extractArticleContent` when scraping.
+ * @param originalContent - Content already known for the article, if any.
+ * @param originalDescription - Description already known for the article, if any.
+ * @returns The content and description to use going forward.
+ */
+export async function enhanceArticleContent(
+  originalTitle: string,
+  originalLink: string,
+  originalContent?: string,
+  originalDescription?: string
+): Promise<{ content?: string; description?: string }> {
+  // What the caller gets back whenever scraping is skipped or unsuccessful.
+  const unchanged = {
+    content: originalContent,
+    description: originalDescription
+  };
+
+  const alreadyKnown = originalContent || originalDescription || '';
+  const hasEnoughText = alreadyKnown.length > 500;
+  if (hasEnoughText) {
+    return unchanged;
+  }
+
+  const { success, content, description } = await extractArticleContent(originalLink);
+  if (!success || !content) {
+    return unchanged;
+  }
+
+  // Prefer the scraped description, but keep the original one if the page had none.
+  return {
+    content,
+    description: description || originalDescription
+  };
+}