Update
This commit is contained in:
182
services/content-extractor.ts
Normal file
182
services/content-extractor.ts
Normal file
@ -0,0 +1,182 @@
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
/**
 * Result of trying to pull readable article text out of a web page.
 *
 * On failure `success` is `false`, `content` is the empty string and `error`
 * carries a human-readable reason.
 */
export interface ExtractedContent {
  /** Page title, when one could be determined. */
  title?: string;
  /** Whitespace-normalized article text; empty when extraction failed. */
  content: string;
  /** Page description taken from the page's meta tags, when present. */
  description?: string;
  /** Whether usable content was extracted. */
  success: boolean;
  /** Failure reason; only set when `success` is `false`. */
  error?: string;
}
|
||||
|
||||
/** Elements that are page chrome rather than article text; stripped before extraction. */
const NON_CONTENT_SELECTOR =
  'script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments';

/**
 * Candidate article containers, tried in order. The first selector whose
 * normalized text is long enough wins, so the list runs from most specific
 * to most generic.
 */
const CONTENT_SELECTORS: readonly string[] = [
  // Common article selectors
  'article',
  '[role="main"]',
  '.article-content',
  '.post-content',
  '.entry-content',
  '.content',
  '.main-content',
  '.article-body',
  '.post-body',
  '.story-body',
  '.news-content',

  // Japanese news site specific selectors
  '.article',
  '.news-article',
  '.post',
  '.entry',
  '#content',
  '#main',
  '.main',

  // Fallback to common containers
  '.container',
  '#container',
  'main',
  'body',
];

/** A container's text must be longer than this to be accepted as the article. */
const MIN_CONTAINER_TEXT_LENGTH = 200;
/** Paragraphs at or below this length are ignored by the paragraph fallback. */
const MIN_PARAGRAPH_LENGTH = 50;
/** Below this length the paragraph result is replaced by the full body text. */
const MIN_FALLBACK_LENGTH = 100;
/** Below this length the extraction is reported as a failure. */
const MIN_CONTENT_LENGTH = 50;
/** Upper bound on returned content, to stay clear of downstream token limits. */
const MAX_CONTENT_LENGTH = 5000;
/** Network timeout for the page request, in milliseconds. */
const FETCH_TIMEOUT_MS = 30000;

/**
 * Downloads `url` and extracts the readable article text from its HTML.
 *
 * Extraction strategy, in order:
 *  1. the first selector in `CONTENT_SELECTORS` whose text exceeds
 *     `MIN_CONTAINER_TEXT_LENGTH`;
 *  2. all `<p>` elements longer than `MIN_PARAGRAPH_LENGTH`, joined by blank lines;
 *  3. the whole `<body>` text.
 *
 * @param url Absolute `http:` or `https:` URL of the page to read.
 * @returns The extracted title, description and content. This function never
 *   rejects: every failure (invalid or unsupported URL, network error, timeout,
 *   non-2xx status, too little text) is reported as `success: false` with an
 *   `error` message.
 */
export async function extractArticleContent(url: string): Promise<ExtractedContent> {
  try {
    // Only fetch web pages. Without this check a feed-supplied link such as
    // `data:` or `blob:` would be handed straight to fetch().
    const target = new URL(url);
    if (target.protocol !== 'http:' && target.protocol !== 'https:') {
      throw new Error(`Unsupported URL protocol: ${target.protocol}`);
    }

    // Fetch the HTML content with browser-like headers.
    const response = await fetch(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
      },
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Remove unwanted elements
    $(NON_CONTENT_SELECTOR).remove();

    // Title: the first <title> only (inline SVGs may carry their own <title>
    // elements, which a bare $('title').text() would concatenate), then the
    // first remaining <h1>, then Open Graph metadata.
    const title =
      $('title').first().text().trim() ||
      $('h1').first().text().trim() ||
      $('meta[property="og:title"]').attr('content')?.trim() ||
      '';

    const description =
      $('meta[name="description"]').attr('content') ||
      $('meta[property="og:description"]').attr('content') ||
      '';

    let content = '';

    // Strategy 1: first candidate container with substantial text.
    // Note: .text() on a multi-element match concatenates every match.
    for (const selector of CONTENT_SELECTORS) {
      const element = $(selector);
      if (element.length === 0) {
        continue;
      }

      // Collapse every whitespace run (newlines included) to a single space.
      const extractedText = element.text().replace(/\s+/g, ' ').trim();
      if (extractedText.length > MIN_CONTAINER_TEXT_LENGTH) {
        content = extractedText;
        break;
      }
    }

    // Strategy 2: stitch together the longer paragraphs.
    if (!content) {
      const paragraphs = $('p').map((_, el) => $(el).text().trim()).get();
      content = paragraphs
        .filter(p => p.length > MIN_PARAGRAPH_LENGTH)
        .join('\n\n');
    }

    // Strategy 3: use the whole body text.
    if (!content || content.length < MIN_FALLBACK_LENGTH) {
      content = $('body').text().replace(/\s+/g, ' ').trim();
    }

    // Validate extracted content
    if (!content || content.length < MIN_CONTENT_LENGTH) {
      return {
        title,
        content: '',
        description,
        success: false,
        error: 'Insufficient content extracted',
      };
    }

    // Limit content length to avoid token limits
    if (content.length > MAX_CONTENT_LENGTH) {
      let cut = MAX_CONTENT_LENGTH;
      // Do not cut between the two halves of a surrogate pair: if the last
      // kept code unit is a high surrogate, drop it too.
      const lastUnit = content.charCodeAt(cut - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1;
      }
      content = content.substring(0, cut) + '...';
    }

    return {
      title,
      content,
      description,
      success: true,
    };
  } catch (error) {
    // Report every failure through the result object instead of rejecting.
    return {
      title: '',
      content: '',
      description: '',
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error occurred',
    };
  }
}
|
||||
|
||||
/**
 * Returns the best available content and description for an article, fetching
 * the linked page only when the supplied text is too short to be useful.
 *
 * The supplied values are returned unchanged when the existing text
 * (`originalContent`, or `originalDescription` if content is empty) is longer
 * than 500 characters, when `originalLink` is blank, or when extraction fails.
 *
 * @param originalTitle Article title. Currently unused; kept so existing
 *   callers keep working.
 * @param originalLink URL of the full article.
 * @param originalContent Content already known for the article, if any.
 * @param originalDescription Description already known for the article, if any.
 * @returns The extracted content when extraction succeeds (with the extracted
 *   description, falling back to `originalDescription`), otherwise the
 *   original values.
 */
export async function enhanceArticleContent(
  originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string
): Promise<{ content?: string; description?: string }> {
  const original = {
    content: originalContent,
    description: originalDescription,
  };

  // If we already have substantial content, use it
  const existingContent = originalContent || originalDescription || '';
  if (existingContent.length > 500) {
    return original;
  }

  // Nothing to fetch: skip an extraction attempt that could only fail.
  if (!originalLink || originalLink.trim() === '') {
    return original;
  }

  // Try to extract content from the URL
  const extracted = await extractArticleContent(originalLink);
  if (extracted.success && extracted.content) {
    return {
      content: extracted.content,
      description: extracted.description || originalDescription,
    };
  }

  // Return original content if extraction failed
  return original;
}
|
Reference in New Issue
Block a user