Update

2025-06-08 15:21:27 +09:00
parent ca236067e8
commit b5ff912fcb
3 changed files with 217 additions and 5 deletions
--- a/scripts/fetch_and_generate.ts
+++ b/scripts/fetch_and_generate.ts
@@ -4,6 +4,7 @@ import {
  openAI_GeneratePodcastContent,
 } from "../services/llm.js";
 import { generateTTS, generateTTSWithoutQueue } from "../services/tts.js";
 import { enhanceArticleContent } from "../services/content-extractor.js";
 import {
  saveFeed,
  getFeedByUrl,
@@ -423,11 +424,22 @@ async function generatePodcastForArticle(article: any, abortSignal?: AbortSignal
      throw new Error('Podcast generation was cancelled');
    }
    // Enhance article content with web scraping if needed
    console.log(`🔍 Enhancing content for: ${article.title}`);
    const enhancedContent = await enhanceArticleContent(
      article.title,
      article.link,
      article.content,
      article.description
    );
    // Generate podcast content for this single article
    const podcastContent = await openAI_GeneratePodcastContent(article.title, [
      {
        title: article.title,
        link: article.link,
        content: enhancedContent.content,
        description: enhancedContent.description,
      },
    ]);
--- a/services/content-extractor.ts
+++ b/services/content-extractor.ts
@@ -0,0 +1,182 @@
 import * as cheerio from 'cheerio';
 /**
  * Outcome of one attempt to pull readable article text out of a web page.
  *
  * The two required fields are always populated; the optional ones carry
  * page metadata or, on failure, a human-readable reason.
  */
 export interface ExtractedContent {
  /** `true` when enough article text was recovered to be usable. */
  success: boolean;
  /** The recovered article text; an empty string when extraction failed. */
  content: string;
  /** Page title, when one could be found. */
  title?: string;
  /** Page description taken from the document's meta tags, when present. */
  description?: string;
  /** Explanation of why extraction failed; absent on success. */
  error?: string;
 }
 /**
  * Downloads a web page and extracts its main article text.
  *
  * The page is fetched with browser-like headers and a 30 second timeout,
  * parsed with cheerio, stripped of navigation/ads/scripts, and then probed
  * with a list of common article container selectors. The first container
  * holding more than 200 characters of text wins; otherwise long paragraphs
  * and finally the whole body text are used as fallbacks.
  *
  * This function never rejects: every failure (invalid or non-HTTP URL,
  * network error, non-2xx status, too little text) is reported through
  * `success: false` and `error` on the returned object.
  *
  * @param url - Absolute http(s) URL of the article page.
  * @returns The extracted title, description and content (capped at 5000
  *   characters plus a trailing `...`), or a failure result.
  */
 export async function extractArticleContent(url: string): Promise<ExtractedContent> {
  const maxLength = 5000;
  // Collapse every run of whitespace (including newlines) into one space.
  const collapseWhitespace = (text: string): string => text.replace(/\s+/g, ' ').trim();

  try {
    // The URL comes from feed data, so only allow plain web schemes.
    // `new URL` throws on malformed input, which the catch below reports.
    const parsedUrl = new URL(url);
    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
      throw new Error(`Unsupported URL protocol: ${parsedUrl.protocol}`);
    }

    // Fetch the HTML content
    const response = await fetch(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
      },
      signal: AbortSignal.timeout(30000), // 30 second timeout
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    // Read metadata BEFORE pruning the DOM: the headline <h1> frequently
    // lives inside <header>, which is removed below. `.first()` on <title>
    // avoids concatenating inline-SVG <title> elements into the page title.
    const title =
      $('title').first().text().trim() ||
      $('h1').first().text().trim() ||
      $('meta[property="og:title"]').attr('content') ||
      '';
    const description =
      $('meta[name="description"]').attr('content') ||
      $('meta[property="og:description"]').attr('content') ||
      '';

    // Remove unwanted elements
    $('script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments').remove();

    // Try multiple content extraction strategies, most specific first
    const contentSelectors = [
      // Common article selectors
      'article',
      '[role="main"]',
      '.article-content',
      '.post-content',
      '.entry-content',
      '.content',
      '.main-content',
      '.article-body',
      '.post-body',
      '.story-body',
      '.news-content',
      // Japanese news site specific selectors
      '.article',
      '.news-article',
      '.post',
      '.entry',
      '#content',
      '#main',
      '.main',
      // Fallback to common containers
      '.container',
      '#container',
      'main',
      'body'
    ];

    let content = '';
    for (const selector of contentSelectors) {
      const element = $(selector);
      if (element.length === 0) {
        continue;
      }
      const extractedText = collapseWhitespace(element.text());
      // Only use if we found substantial content
      if (extractedText.length > 200) {
        content = extractedText;
        break;
      }
    }

    // If still no content, try paragraph extraction
    if (!content) {
      const paragraphs = $('p').map((_, el) => $(el).text().trim()).get();
      content = paragraphs
        .filter(p => p.length > 50) // Filter out short paragraphs
        .join('\n\n');
    }

    // Final fallback: use body text
    if (!content || content.length < 100) {
      content = collapseWhitespace($('body').text());
    }

    // Validate extracted content
    if (!content || content.length < 50) {
      return {
        title,
        content: '',
        description,
        success: false,
        error: 'Insufficient content extracted'
      };
    }

    // Limit content length to avoid token limits
    if (content.length > maxLength) {
      // Do not cut between the two halves of a surrogate pair (e.g. emoji),
      // which would leave a lone surrogate in the text sent downstream.
      const lastUnit = content.charCodeAt(maxLength - 1);
      const cutAt = lastUnit >= 0xd800 && lastUnit <= 0xdbff ? maxLength - 1 : maxLength;
      content = content.substring(0, cutAt) + '...';
    }

    return {
      title,
      content,
      description,
      success: true
    };
  } catch (error) {
    // Best-effort API: report the failure instead of propagating it.
    return {
      title: '',
      content: '',
      description: '',
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error occurred'
    };
  }
 }
 /**
  * Returns the best available text for an article, scraping the article
  * page only when the feed-provided text is short.
  *
  * If the feed content (or, when that is empty, the feed description) is
  * longer than 500 characters it is returned untouched. Otherwise the page
  * at `originalLink` is scraped; a successful scrape replaces the content
  * and supplies a description when the page has one. When scraping fails
  * the original values are returned unchanged.
  *
  * @param originalTitle - Article title from the feed (not used in the decision).
  * @param originalLink - URL of the article page to scrape when needed.
  * @param originalContent - Content supplied by the feed, if any.
  * @param originalDescription - Description supplied by the feed, if any.
  * @returns The content and description to hand to downstream generation.
  */
 export async function enhanceArticleContent(
  originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string
 ): Promise<{ content?: string; description?: string }> {
  // What the caller gave us, returned whenever scraping is skipped or fails.
  const fallback = {
    content: originalContent,
    description: originalDescription
  };

  // Feed text that is already substantial needs no network round-trip.
  const feedText = originalContent || originalDescription || '';
  if (feedText.length > 500) {
    return fallback;
  }

  const scraped = await extractArticleContent(originalLink);
  if (!scraped.success || !scraped.content) {
    return fallback;
  }

  return {
    content: scraped.content,
    description: scraped.description || originalDescription
  };
 }
--- a/services/llm.ts
+++ b/services/llm.ts
@@ -59,7 +59,7 @@ export async function openAI_ClassifyFeed(title: string): Promise<string> {
 export async function openAI_GeneratePodcastContent(
  title: string,
-  items: Array<{ title: string; link: string }>,
+  items: Array<{ title: string; link: string; content?: string; description?: string }>,
 ): Promise<string> {
  if (!title || title.trim() === "") {
    throw new Error("Feed title is required for podcast content generation");
@@ -77,24 +77,42 @@ export async function openAI_GeneratePodcastContent(
    throw new Error("No valid news items found (title and link required)");
  }
  // Build detailed article information including content
  const articleDetails = validItems.map((item, i) => {
    let articleInfo = `${i + 1}. タイトル: ${item.title}\nURL: ${item.link}`;
    // Add content if available
    const content = item.content || item.description;
    if (content && content.trim()) {
      // Limit content length to avoid token limits
      const maxContentLength = 2000;
      const truncatedContent = content.length > maxContentLength 
        ? content.substring(0, maxContentLength) + "..." 
        : content;
      articleInfo += `\n内容: ${truncatedContent}`;
    }
    return articleInfo;
  }).join("\n\n");
  const prompt = `
 あなたはプロのポッドキャスタです。以下に示すフィードタイトルに基づき、そのトピックに関する詳細なポッドキャスト原稿を作成してください。
 フィードタイトル: ${title}
 関連するニュース記事:
-${validItems.map((item, i) => `${i + 1}. ${item.title} - ${item.link}`).join("\n")}
+${articleDetails}
 以下の要件を満たしてください:
 1. もし英単語が含まれている場合は、すべてカタカナに変換してください (例: "Google" → "グーグル")
 2. もし英語の文が含まれている場合は、すべて日本語に翻訳してください
-3. 各ニュース記事の内容を要約し、関連性を説明してください
+3. 各ニュース記事の具体的な内容を基に詳細な要約と解説を行ってください
-4. 視聴者にとっての価値や興味ポイントを解説してください
+4. 記事の内容から重要なポイントを抽出し、視聴者にとっての価値や興味ポイントを解説してください
 5. 約1000文字〜1500文字程度の長さにしてください
 6. 自然な日本語の口語表現を使ってください
 7. トピック全体のまとめで締めくくってください
-この構成でポッドキャスト原稿を書いてください。
+記事の実際の内容を活用して、具体的で価値のあるポッドキャスト原稿を作成してください。
 `;
  try {