Update

2025-06-08 15:21:27 +09:00
parent ca236067e8
commit b5ff912fcb
3 changed files with 217 additions and 5 deletions
@@ -4,6 +4,7 @@ import {
  openAI_GeneratePodcastContent,
 } from "../services/llm.js";
 import { generateTTS, generateTTSWithoutQueue } from "../services/tts.js";
+import { enhanceArticleContent } from "../services/content-extractor.js";
 import {
  saveFeed,
  getFeedByUrl,
@@ -423,11 +424,22 @@ async function generatePodcastForArticle(article: any, abortSignal?: AbortSignal
      throw new Error('Podcast generation was cancelled');
    }

+    // Enhance article content with web scraping if needed
+    console.log(`🔍 Enhancing content for: ${article.title}`);
+    const enhancedContent = await enhanceArticleContent(
+      article.title,
+      article.link,
+      article.content,
+      article.description
+    );
+
    // Generate podcast content for this single article
    const podcastContent = await openAI_GeneratePodcastContent(article.title, [
      {
        title: article.title,
        link: article.link,
+        content: enhancedContent.content,
+        description: enhancedContent.description,
      },
    ]);
    
@@ -0,0 +1,182 @@
+import * as cheerio from 'cheerio';
+
+export interface ExtractedContent { // Outcome of one page-extraction attempt, as returned by extractArticleContent.
+  title?: string; // Page title; extractArticleContent sets '' when none was found or the attempt threw.
+  content: string; // Extracted body text; always '' when success is false.
+  description?: string; // Text taken from the page's description meta tags; '' when none was found.
+  success: boolean; // True only when enough body text was extracted to be usable.
+  error?: string; // Human-readable failure reason; only populated when success is false.
+}
+
+/** Collapses every run of whitespace (newlines included) into one space and trims the ends. */
+function collapseWhitespace(text: string): string {
+  return text.replace(/\s+/g, ' ').trim();
+}
+
+/**
+ * Cuts `text` down to at most `maxLength` UTF-16 code units and appends '...'.
+ * If the cut would land between the two halves of a surrogate pair, one more
+ * code unit is dropped so the result never ends in a lone high surrogate.
+ */
+function truncateOnCodePoint(text: string, maxLength: number): string {
+  if (text.length <= maxLength) {
+    return text;
+  }
+  const lastKeptUnit = text.charCodeAt(maxLength - 1);
+  const splitsSurrogatePair = lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff;
+  const cutAt = splitsSurrogatePair ? maxLength - 1 : maxLength;
+  return text.substring(0, cutAt) + '...';
+}
+
+/**
+ * Downloads a web page and extracts its title, meta description and main body text.
+ *
+ * The function never throws: every failure (invalid or non-http(s) URL, network
+ * error, timeout, non-2xx status, non-textual content type, too little text) is
+ * reported through `success: false` together with an `error` message.
+ *
+ * @param url - Absolute http(s) URL of the article page.
+ * @returns The extracted fields. On success `content` is whitespace-normalised
+ *   (except in the paragraph fallback, which joins paragraphs with blank lines)
+ *   and capped at 5000 UTF-16 code units plus a trailing '...'.
+ */
+export async function extractArticleContent(url: string): Promise<ExtractedContent> {
+  try {
+    // Validate the URL up front so a malformed or non-web link yields a clear error
+    // message instead of an opaque fetch failure. `new URL` throws on invalid input.
+    const target = new URL(url);
+    if (target.protocol !== 'http:' && target.protocol !== 'https:') {
+      throw new Error(`Unsupported URL protocol: ${target.protocol}`);
+    }
+
+    // Fetch the HTML content
+    const response = await fetch(url, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+      },
+      signal: AbortSignal.timeout(30000), // 30 second timeout
+    });
+
+    if (!response.ok) {
+      // Discard the unread body so the underlying connection is released promptly.
+      void response.body?.cancel().catch(() => undefined);
+      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+    }
+
+    // Parsing a PDF, image or JSON payload as HTML would yield garbage "content",
+    // so only continue for textual/markup responses. A missing header is tolerated.
+    const contentType = response.headers.get('content-type') ?? '';
+    if (contentType && !/html|xml|text\/plain/i.test(contentType)) {
+      void response.body?.cancel().catch(() => undefined);
+      throw new Error(`Unsupported content type: ${contentType}`);
+    }
+
+    const html = await response.text();
+    const $ = cheerio.load(html);
+
+    // Remove unwanted elements
+    $('script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments').remove();
+
+    // Extract title. Only the first <title> is read: inline SVGs may carry their
+    // own <title> elements whose text would otherwise be concatenated.
+    const title =
+      $('title').first().text().trim() ||
+      $('h1').first().text().trim() ||
+      $('meta[property="og:title"]').attr('content')?.trim() ||
+      '';
+
+    // Extract description
+    const description =
+      $('meta[name="description"]').attr('content')?.trim() ||
+      $('meta[property="og:description"]').attr('content')?.trim() ||
+      '';
+
+    // Candidate containers, tried in order from most to least specific.
+    const contentSelectors = [
+      // Common article selectors
+      'article',
+      '[role="main"]',
+      '.article-content',
+      '.post-content',
+      '.entry-content',
+      '.content',
+      '.main-content',
+      '.article-body',
+      '.post-body',
+      '.story-body',
+      '.news-content',
+
+      // Japanese news site specific selectors
+      '.article',
+      '.news-article',
+      '.post',
+      '.entry',
+      '#content',
+      '#main',
+      '.main',
+
+      // Fallback to common containers
+      '.container',
+      '#container',
+      'main',
+      'body'
+    ];
+
+    let content = '';
+    for (const selector of contentSelectors) {
+      const element = $(selector);
+      if (element.length === 0) {
+        continue;
+      }
+
+      // Only use the first selector that yields substantial content
+      const extractedText = collapseWhitespace(element.text());
+      if (extractedText.length > 200) {
+        content = extractedText;
+        break;
+      }
+    }
+
+    // If still no content, try paragraph extraction
+    if (!content) {
+      const paragraphs = $('p').map((_, el) => $(el).text().trim()).get();
+      content = paragraphs
+        .filter(p => p.length > 50) // Filter out short paragraphs
+        .join('\n\n');
+    }
+
+    // Final fallback: use body text
+    if (!content || content.length < 100) {
+      content = collapseWhitespace($('body').text());
+    }
+
+    // Validate extracted content
+    if (!content || content.length < 50) {
+      return {
+        title,
+        content: '',
+        description,
+        success: false,
+        error: 'Insufficient content extracted'
+      };
+    }
+
+    // Limit content length to avoid token limits
+    const maxLength = 5000;
+    content = truncateOnCodePoint(content, maxLength);
+
+    return {
+      title,
+      content,
+      description,
+      success: true
+    };
+
+  } catch (error) {
+    // Best-effort contract: report the failure instead of propagating it.
+    return {
+      title: '',
+      content: '',
+      description: '',
+      success: false,
+      error: error instanceof Error ? error.message : 'Unknown error occurred'
+    };
+  }
+}
+
+/**
+ * Picks the text to use for an article: the feed-supplied text when it is
+ * already substantial, otherwise text scraped from the article page.
+ *
+ * @param originalTitle - Article title from the feed (not read by this function).
+ * @param originalLink - URL handed to extractArticleContent when scraping is needed.
+ * @param originalContent - Body text supplied by the feed, if any.
+ * @param originalDescription - Summary supplied by the feed, if any.
+ * @returns The scraped content (with the scraped description, falling back to
+ *   the feed description) when extraction succeeds; the unchanged feed values
+ *   in every other case.
+ */
+export async function enhanceArticleContent(
+  originalTitle: string,
+  originalLink: string,
+  originalContent?: string,
+  originalDescription?: string
+): Promise<{ content?: string; description?: string }> {
+  // What the feed gave us; returned untouched whenever scraping is skipped or fails.
+  const feedValues = {
+    content: originalContent,
+    description: originalDescription
+  };
+
+  // More than 500 characters of feed text counts as substantial: no scraping needed.
+  const feedText = originalContent || originalDescription || '';
+  if (feedText.length > 500) {
+    return feedValues;
+  }
+
+  // Otherwise fetch the page itself and see whether it yields usable text.
+  const scraped = await extractArticleContent(originalLink);
+  const scrapeUsable = scraped.success && Boolean(scraped.content);
+  if (!scrapeUsable) {
+    return feedValues;
+  }
+
+  return {
+    content: scraped.content,
+    description: scraped.description || originalDescription
+  };
+}
@@ -59,7 +59,7 @@ export async function openAI_ClassifyFeed(title: string): Promise<string> {

 export async function openAI_GeneratePodcastContent(
  title: string,
-  items: Array<{ title: string; link: string }>,
+  items: Array<{ title: string; link: string; content?: string; description?: string }>,
 ): Promise<string> {
  if (!title || title.trim() === "") {
    throw new Error("Feed title is required for podcast content generation");
@@ -77,24 +77,42 @@ export async function openAI_GeneratePodcastContent(
    throw new Error("No valid news items found (title and link required)");
  }

+  // Build detailed article information including content
+  const articleDetails = validItems.map((item, i) => {
+    let articleInfo = `${i + 1}. タイトル: ${item.title}\nURL: ${item.link}`;
+    
+    // Add content if available
+    const content = item.content || item.description;
+    if (content && content.trim()) {
+      // Limit content length to avoid token limits
+      const maxContentLength = 2000;
+      const truncatedContent = content.length > maxContentLength 
+        ? content.substring(0, maxContentLength) + "..." 
+        : content;
+      articleInfo += `\n内容: ${truncatedContent}`;
+    }
+    
+    return articleInfo;
+  }).join("\n\n");
+
  const prompt = `
 あなたはプロのポッドキャスタです。以下に示すフィードタイトルに基づき、そのトピックに関する詳細なポッドキャスト原稿を作成してください。

 フィードタイトル: ${title}

 関連するニュース記事:
-${validItems.map((item, i) => `${i + 1}. ${item.title} - ${item.link}`).join("\n")}
+${articleDetails}

 以下の要件を満たしてください:
 1. もし英単語が含まれている場合は、すべてカタカナに変換してください (例: "Google" → "グーグル")
 2. もし英語の文が含まれている場合は、すべて日本語に翻訳してください
-3. 各ニュース記事の内容を要約し、関連性を説明してください
-4. 視聴者にとっての価値や興味ポイントを解説してください
+3. 各ニュース記事の具体的な内容を基に詳細な要約と解説を行ってください
+4. 記事の内容から重要なポイントを抽出し、視聴者にとっての価値や興味ポイントを解説してください
 5. 約1000文字〜1500文字程度の長さにしてください
 6. 自然な日本語の口語表現を使ってください
 7. トピック全体のまとめで締めくくってください

-この構成でポッドキャスト原稿を書いてください。
+記事の実際の内容を活用して、具体的で価値のあるポッドキャスト原稿を作成してください。
 `;

  try {