diff --git a/scripts/fetch_and_generate.ts b/scripts/fetch_and_generate.ts
index f956279..d5be1d3 100644
--- a/scripts/fetch_and_generate.ts
+++ b/scripts/fetch_and_generate.ts
@@ -4,6 +4,7 @@ import {
   openAI_GeneratePodcastContent,
 } from "../services/llm.js";
 import { generateTTS, generateTTSWithoutQueue } from "../services/tts.js";
+import { enhanceArticleContent } from "../services/content-extractor.js";
 import {
   saveFeed,
   getFeedByUrl,
@@ -423,11 +424,22 @@ async function generatePodcastForArticle(article: any, abortSignal?: AbortSignal
     throw new Error('Podcast generation was cancelled');
   }
 
+  // Enhance article content with web scraping if needed
+  console.log(`🔍 Enhancing content for: ${article.title}`);
+  const enhancedContent = await enhanceArticleContent(
+    article.title,
+    article.link,
+    article.content,
+    article.description
+  );
+
   // Generate podcast content for this single article
   const podcastContent = await openAI_GeneratePodcastContent(article.title, [
     {
       title: article.title,
       link: article.link,
+      content: enhancedContent.content,
+      description: enhancedContent.description,
     },
   ]);
 
diff --git a/services/content-extractor.ts b/services/content-extractor.ts
new file mode 100644
index 0000000..4b8fe4f
--- /dev/null
+++ b/services/content-extractor.ts
@@ -0,0 +1,182 @@
+import * as cheerio from 'cheerio';
+
+export interface ExtractedContent {
+  title?: string;
+  content: string;
+  description?: string;
+  success: boolean;
+  error?: string;
+}
+
+export async function extractArticleContent(url: string): Promise<ExtractedContent> {
+  try {
+    // Fetch the HTML content
+    const response = await fetch(url, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+      },
+      signal: AbortSignal.timeout(30000), // 30 second timeout
+    });
+
+    if (!response.ok) {
+      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+    }
+
+    const html = await response.text();
+    const $ = cheerio.load(html);
+
+    // Remove unwanted elements
+    $('script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments').remove();
+
+    let content = '';
+    let title = '';
+    let description = '';
+
+    // Extract title
+    title = $('title').text().trim() ||
+            $('h1').first().text().trim() ||
+            $('meta[property="og:title"]').attr('content') ||
+            '';
+
+    // Extract description
+    description = $('meta[name="description"]').attr('content') ||
+                  $('meta[property="og:description"]').attr('content') ||
+                  '';
+
+    // Try multiple content extraction strategies
+    const contentSelectors = [
+      // Common article selectors
+      'article',
+      '[role="main"]',
+      '.article-content',
+      '.post-content',
+      '.entry-content',
+      '.content',
+      '.main-content',
+      '.article-body',
+      '.post-body',
+      '.story-body',
+      '.news-content',
+
+      // Japanese news site specific selectors
+      '.article',
+      '.news-article',
+      '.post',
+      '.entry',
+      '#content',
+      '#main',
+      '.main',
+
+      // Fallback to common containers
+      '.container',
+      '#container',
+      'main',
+      'body'
+    ];
+
+    for (const selector of contentSelectors) {
+      const element = $(selector);
+      if (element.length > 0) {
+        // Get text content and clean it up
+        let extractedText = element.text().trim();
+
+        // Remove extra whitespace and normalize
+        extractedText = extractedText
+          .replace(/\s+/g, ' ')
+          .replace(/\n\s*\n/g, '\n')
+          .trim();
+
+        // Only use if we found substantial content
+        if (extractedText.length > 200) {
+          content = extractedText;
+          break;
+        }
+      }
+    }
+
+    // If still no content, try paragraph extraction
+    if (!content) {
+      const paragraphs = $('p').map((_, el) => $(el).text().trim()).get();
+      content = paragraphs
+        .filter(p => p.length > 50) // Filter out short paragraphs
+        .join('\n\n');
+    }
+
+    // Final fallback: use body text
+    if (!content || content.length < 100) {
+      content = $('body').text()
+        .replace(/\s+/g, ' ')
+        .trim();
+    }
+
+    // Validate extracted content
+    if (!content || content.length < 50) {
+      return {
+        title,
+        content: '',
+        description,
+        success: false,
+        error: 'Insufficient content extracted'
+      };
+    }
+
+    // Limit content length to avoid token limits
+    const maxLength = 5000;
+    if (content.length > maxLength) {
+      content = content.substring(0, maxLength) + '...';
+    }
+
+    return {
+      title,
+      content,
+      description,
+      success: true
+    };
+
+  } catch (error) {
+    return {
+      title: '',
+      content: '',
+      description: '',
+      success: false,
+      error: error instanceof Error ? error.message : 'Unknown error occurred'
+    };
+  }
+}
+
+export async function enhanceArticleContent(
+  originalTitle: string,
+  originalLink: string,
+  originalContent?: string,
+  originalDescription?: string
+): Promise<{ content?: string; description?: string }> {
+  // If we already have substantial content, use it
+  const existingContent = originalContent || originalDescription || '';
+  if (existingContent.length > 500) {
+    return {
+      content: originalContent,
+      description: originalDescription
+    };
+  }
+
+  // Try to extract content from the URL
+  const extracted = await extractArticleContent(originalLink);
+
+  if (extracted.success && extracted.content) {
+    return {
+      content: extracted.content,
+      description: extracted.description || originalDescription
+    };
+  }
+
+  // Return original content if extraction failed
+  return {
+    content: originalContent,
+    description: originalDescription
+  };
+}
\ No newline at end of file
diff --git a/services/llm.ts b/services/llm.ts
index f3f623a..ba86d99 100644
--- a/services/llm.ts
+++ b/services/llm.ts
@@ -59,7 +59,7 @@ export async function openAI_ClassifyFeed(title: string): Promise<string> {
 export async function openAI_GeneratePodcastContent(
   title: string,
-  items: Array<{ title: string; link: string }>,
+  items: Array<{ title: string; link: string; content?: string; description?: string }>,
 ): Promise<string> {
   if (!title || title.trim() === "") {
     throw new Error("Feed title is required for podcast content generation");
   }
@@ -77,24 +77,42 @@
     throw new Error("No valid news items found (title and link required)");
   }
 
+  // Build detailed article information including content
+  const articleDetails = validItems.map((item, i) => {
+    let articleInfo = `${i + 1}. タむトル: ${item.title}\nURL: ${item.link}`;
+
+    // Add content if available
+    const content = item.content || item.description;
+    if (content && content.trim()) {
+      // Limit content length to avoid token limits
+      const maxContentLength = 2000;
+      const truncatedContent = content.length > maxContentLength
+        ? content.substring(0, maxContentLength) + "..."
+        : content;
+      articleInfo += `\n内容: ${truncatedContent}`;
+    }
+
+    return articleInfo;
+  }).join("\n\n");
+
   const prompt = `
 あなたはプロのポッドキャスタです。以䞋に瀺すフィヌドタむトルに基づき、そのトピックに関する詳现なポッドキャスト原皿を䜜成しおください。
 
 フィヌドタむトル: ${title}
 
 関連するニュヌス蚘事:
-${validItems.map((item, i) => `${i + 1}. ${item.title} - ${item.link}`).join("\n")}
+${articleDetails}
 
 以䞋の芁件を満たしおください:
 1. もし英単語が含たれおいる堎合は、すべおカタカナに倉換しおください (䟋: "Google" → "グヌグル")
 2. もし英語の文が含たれおいる堎合は、すべお日本語に翻蚳しおください
-3. 各ニュヌス蚘事の内容を芁玄し、関連性を説明しおください
-4. 芖聎者にずっおの䟡倀や興味ポむントを解説しおください
+3. 各ニュヌス蚘事の具䜓的な内容を基に詳现な芁玄ず解説を行っおください
+4. 蚘事の内容から重芁なポむントを抜出し、芖聎者にずっおの䟡倀や興味ポむントを解説しおください
 5. 箄1000文字〜1500文字皋床の長さにしおください
 6. 自然な日本語の口語衚珟を䜿っおください
 7. トピック党䜓のたずめで締めくくっおください
 
-この構成でポッドキャスト原皿を曞いおください。
+蚘事の実際の内容を掻甚しお、具䜓的で䟡倀のあるポッドキャスト原皿を䜜成しおください。
 `;
 
   try {