|
|
|
@ -1,4 +1,6 @@
|
|
|
|
|
import puppeteer, { type Browser } from "puppeteer";
|
|
|
|
|
import * as cheerio from "cheerio";
|
|
|
|
|
import type { CheerioAPI } from "cheerio";
|
|
|
|
|
|
|
|
|
|
export interface ExtractedContent {
|
|
|
|
|
title?: string;
|
|
|
|
@ -235,6 +237,300 @@ export async function closeBrowser(): Promise<void> {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Fallback content extraction using fetch + cheerio.
 *
 * Downloads the page with a plain HTTP request and parses the returned HTML
 * with cheerio. No page scripts are executed, so only content present in the
 * initial HTML response can be extracted.
 *
 * Extraction strategy, in order:
 *  1. Score every element matched by a list of "main content" selectors and
 *     take the highest-scoring one (merging the top few when it is short).
 *  2. If that yields fewer than 200 characters, aggregate long `<p>` elements.
 *  3. If that is still too short, use `articleBody` / `text` from JSON-LD
 *     structured data embedded in the page.
 *
 * @param url - Absolute URL of the page to download.
 * @returns The extracted title, description and content. This function does
 *   not throw: network errors, non-2xx responses and pages with fewer than
 *   100 characters of extractable text are reported with `success: false`
 *   and a message in `error`.
 */
async function extractWithFetchFallback(url: string): Promise<ExtractedContent> {
  console.log(`Using fetch fallback for: ${url}`);

  try {
    const userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

    const response = await fetch(url, {
      headers: {
        'User-Agent': userAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'no-cache'
      },
      signal: AbortSignal.timeout(30000) // 30 second timeout
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Normalizes extracted text to a single line: zero-width characters are
    // dropped first (so they cannot leave doubled spaces behind), then every
    // whitespace run, including newlines, collapses to one space.
    const cleanText = (text: string): string =>
      text
        .replace(/[\u200B-\u200D\uFEFF]/g, '')
        .replace(/\s+/g, ' ')
        .trim();

    // Returns the first candidate that is non-empty after trimming.
    const firstNonEmpty = (values: ReadonlyArray<string | undefined>): string => {
      for (const value of values) {
        const trimmed = value?.trim();
        if (trimmed) return trimmed;
      }
      return '';
    };

    // Capture JSON-LD payloads BEFORE the cleanup below: "script" is in the
    // unwanted-selector list, so these nodes no longer exist afterwards.
    const jsonLdBlocks: string[] = $('script[type="application/ld+json"]')
      .toArray()
      .map((node) => $(node).html() ?? '')
      .filter((raw) => raw.trim().length > 0);

    // Walks a parsed JSON-LD value (object, array, or "@graph" container) and
    // returns the first non-empty `articleBody`, falling back to `text`.
    const findStructuredText = (node: unknown, depth = 0): string => {
      if (depth > 5 || node === null || typeof node !== 'object') return '';
      if (Array.isArray(node)) {
        for (const item of node) {
          const found = findStructuredText(item, depth + 1);
          if (found) return found;
        }
        return '';
      }
      const record = node as Record<string, unknown>;
      for (const key of ['articleBody', 'text']) {
        const value = record[key];
        if (typeof value === 'string' && value.trim().length > 0) return value;
      }
      return findStructuredText(record['@graph'], depth + 1);
    };

    // Remove unwanted elements first
    const unwantedSelectors = [
      "script", "style", "noscript", "iframe", "embed", "object",
      "nav", "header", "footer", "aside", "form",
      ".advertisement", ".ads", ".ad", ".adsbygoogle", "[class*='ad-']", "[id*='ad-']",
      ".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb",
      ".social-share", ".share", ".social", ".sns",
      ".comments", ".comment", ".disqus",
      ".cookie-banner", ".cookie", ".gdpr",
      ".popup", ".modal", ".overlay", ".lightbox",
      ".related", ".recommended", ".more-stories",
      ".tags", ".categories", ".metadata",
      ".author-bio", ".author-info",
      ".newsletter", ".subscribe", ".signup",
      "[role='complementary']", "[role='banner']", "[role='contentinfo']",
      "[aria-label*='advertisement']", "[aria-label*='sidebar']"
    ];

    for (const selector of unwantedSelectors) {
      $(selector).remove();
    }

    // Extract title: social meta tags first, then visible headings.
    const title = firstNonEmpty([
      $('meta[property="og:title"]').attr('content'),
      $('meta[name="twitter:title"]').attr('content'),
      $('h1').first().text(),
      $('.article-title, .post-title, .entry-title').first().text(),
      $('title').text(),
      $('[itemprop="headline"]').first().text()
    ]);

    // Extract description
    const description = firstNonEmpty([
      $('meta[property="og:description"]').attr('content'),
      $('meta[name="description"]').attr('content'),
      $('meta[name="twitter:description"]').attr('content'),
      $('[itemprop="description"]').first().text()
    ]);

    // Content selectors (same as in Puppeteer version), ordered from most to
    // least specific.
    const contentSelectors = [
      '[itemtype*="Article"] [itemprop="articleBody"]',
      '[itemtype*="NewsArticle"] [itemprop="articleBody"]',
      '[itemtype*="BlogPosting"] [itemprop="articleBody"]',
      'article[role="main"]',
      'main article',
      '[role="main"] article',
      'article',
      '.post-content', '.entry-content', '.article-content', '.content-area',
      '.article-body', '.post-body', '.entry-body', '.story-body',
      '.main-content', '.primary-content', '.page-content',
      '.news-content', '.blog-content', '.editorial-content',
      '.wp-content', '.entry', '.post',
      '.section-content', '.postArticle-content', '.post-full-content',
      '.markup', '.section--body', '.section-divider + .section-content',
      '.honbun', '.main_text', '.article_body', '.news_body',
      '.entry_text', '.blog_text', '.content_text',
      '.kiji', '.news', '.article',
      'main', '[role="main"]',
      '#content', '#main', '#article', '#post', '#entry',
      '#main-content', '#primary', '#content-area',
      '.content', '.main', '.wrapper', '.container'
    ];

    // Heuristic quality score for a candidate container; never negative.
    const calculateContentScore = (element: cheerio.Cheerio<any>): number => {
      const text = element.text() || '';
      if (text.length < 100) return 0;

      // Base score from text length (diminishing returns, capped at 50)
      let score = Math.min(text.length / 100, 50);

      // Paragraph density
      const paragraphs = element.find('p');
      const avgParagraphLength = paragraphs.length > 0
        ? paragraphs.toArray().reduce((sum, p) => sum + $(p).text().length, 0) / paragraphs.length
        : 0;

      if (avgParagraphLength > 100) score += 20;
      if (paragraphs.length > 3) score += 10;

      // Link density: reward prose, penalize blocks that are mostly anchors
      const linkText = element
        .find('a')
        .toArray()
        .reduce((sum, link) => sum + $(link).text().length, 0);
      const linkDensity = linkText / text.length; // text.length >= 100 here
      if (linkDensity < 0.2) score += 15;
      else if (linkDensity < 0.4) score += 5;
      else score -= 10;

      // Bonus for article-like structure
      if (element.prop('tagName') === 'ARTICLE') score += 25;
      if (element.attr('role') === 'main') score += 20;
      if (element.find('h1, h2, h3').length > 0) score += 10;

      // Bonus for semantic elements
      const semanticElements = element.find('p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol');
      if (semanticElements.length > 5) score += 15;

      // Penalty for navigation-like class/id names
      const navWords = ['メニュー', 'ナビ', 'カテゴリ', 'タグ', 'menu', 'navigation', 'nav', 'sidebar'];
      const className = (element.attr('class') || '').toLowerCase();
      const id = (element.attr('id') || '').toLowerCase();
      if (navWords.some(word => className.includes(word) || id.includes(word))) {
        score -= 20;
      }

      return Math.max(score, 0);
    };

    // Collect and score all content candidates
    interface ContentCandidate {
      element: cheerio.Cheerio<any>;
      score: number;
      content: string;
      selector: string;
    }

    const candidates: ContentCandidate[] = [];
    // One DOM node often matches several selectors ("main article", "article");
    // record it once so duplicates cannot crowd the top of the ranking.
    const seenNodes = new Set<unknown>();

    for (const selector of contentSelectors) {
      try {
        $(selector).each((index, node) => {
          if (seenNodes.has(node)) return;
          seenNodes.add(node);

          const $element = $(node);
          const text = $element.text() || '';
          if (text.length > 200) {
            candidates.push({
              element: $element,
              score: calculateContentScore($element),
              content: cleanText(text),
              selector: `${selector}[${index}]`
            });
          }
        });
      } catch (selectorError: unknown) {
        // A selector the engine rejects should not abort the whole extraction.
        console.warn(`Skipping content selector "${selector}":`, selectorError);
      }
    }

    // Sort candidates by score (highest first)
    candidates.sort((a, b) => b.score - a.score);

    console.log(`Found ${candidates.length} content candidates`);

    // Get the best content
    let content = "";
    const bestCandidate = candidates[0];
    if (bestCandidate) {
      console.log(`Best candidate score: ${bestCandidate.score}, selector: ${bestCandidate.selector}`);
      content = bestCandidate.content;

      // If the best candidate is still short, try combining top candidates
      if (content.length < 500 && candidates.length > 1) {
        let pieces: string[] = [];
        for (const candidate of candidates.slice(0, 3)) {
          if (candidate.score <= 10) continue;
          // Skip text already covered by a selected ancestor/duplicate...
          if (pieces.some((piece) => piece.includes(candidate.content))) continue;
          // ...and drop selected pieces that this candidate fully contains.
          pieces = pieces.filter((piece) => !candidate.content.includes(piece));
          pieces.push(candidate.content);
        }
        const combinedContent = cleanText(pieces.join(' '));
        if (combinedContent.length > content.length) {
          content = combinedContent;
        }
      }
    }

    // Fallback strategies if still no good content
    if (content.length < 200) {
      console.log('Using paragraph aggregation fallback...');
      const boilerplateWords = ['cookie', 'privacy', 'terms of service', 'subscribe', 'newsletter'];
      const paragraphs = $('p')
        .toArray()
        .map((p) => $(p).text().trim())
        .filter((p) => p.length > 50)
        .filter((p) => {
          const lowerP = p.toLowerCase();
          return !boilerplateWords.some((word) => lowerP.includes(word));
        });

      if (paragraphs.length > 0) {
        content = cleanText(paragraphs.join(' '));
      }
    }

    // Final fallback: structured data
    if (content.length < 200) {
      console.log('Trying structured data fallback...');
      for (const raw of jsonLdBlocks) {
        let parsed: unknown;
        try {
          parsed = JSON.parse(raw);
        } catch {
          // Malformed JSON-LD is common in the wild; try the next block.
          continue;
        }
        const structuredText = findStructuredText(parsed);
        if (structuredText) {
          content = cleanText(structuredText);
          break;
        }
      }
    }

    // Limit content length to avoid token limits
    const maxLength = 50000;
    if (content.length > maxLength) {
      content = content.substring(0, maxLength) + "...";
    }

    console.log(`Fetch fallback extracted content: ${content.length} characters`);

    if (content.length < 100) {
      return {
        title,
        content: '',
        description,
        success: false,
        error: `Insufficient content extracted via fetch fallback (${content.length} characters)`,
      };
    }

    return {
      title,
      content,
      description,
      success: true,
    };

  } catch (error: unknown) {
    console.error(`Fetch fallback failed:`, error);
    return {
      title: '',
      content: '',
      description: '',
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error in fetch fallback',
    };
  }
}
|
|
|
|
|
|
|
|
|
|
async function extractWithRetry(url: string): Promise<ExtractedContent> {
|
|
|
|
|
const userAgents = [
|
|
|
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
|
|
@ -667,6 +963,36 @@ export async function extractArticleContent(
|
|
|
|
|
} catch (error) {
|
|
|
|
|
console.error(`Content extraction failed after all retries for ${url}:`, error);
|
|
|
|
|
|
|
|
|
|
// Check if this is a Puppeteer launch/browser failure that should trigger fallback
|
|
|
|
|
const shouldUseFallback = error instanceof Error && (
|
|
|
|
|
error.message.includes('TimeoutError') ||
|
|
|
|
|
error.message.includes('Timed out after') ||
|
|
|
|
|
error.message.includes('waiting for the WS endpoint URL') ||
|
|
|
|
|
error.message.includes('Browser closed') ||
|
|
|
|
|
error.message.includes('Target closed') ||
|
|
|
|
|
error.message.includes('Session closed') ||
|
|
|
|
|
error.message.includes('Protocol error') ||
|
|
|
|
|
error.message.includes('Connection terminated') ||
|
|
|
|
|
error.message.includes('spawn') || // Process spawn errors
|
|
|
|
|
error.message.includes('ECONNRESET') ||
|
|
|
|
|
error.message.includes('ECONNREFUSED') ||
|
|
|
|
|
error.message.includes('ENOTFOUND')
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
if (shouldUseFallback) {
|
|
|
|
|
console.log(`Puppeteer failed, trying fetch fallback for ${url}`);
|
|
|
|
|
try {
|
|
|
|
|
const fallbackResult = await extractWithFetchFallback(url);
|
|
|
|
|
if (fallbackResult.success) {
|
|
|
|
|
console.log(`Fetch fallback succeeded for ${url}`);
|
|
|
|
|
return fallbackResult;
|
|
|
|
|
}
|
|
|
|
|
console.log(`Fetch fallback also failed for ${url}:`, fallbackResult.error);
|
|
|
|
|
} catch (fallbackError) {
|
|
|
|
|
console.error(`Fetch fallback threw error for ${url}:`, fallbackError);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Provide more specific error messages
|
|
|
|
|
let errorMessage = "Unknown error occurred";
|
|
|
|
|
if (error instanceof Error) {
|
|
|
|
@ -682,6 +1008,8 @@ export async function extractArticleContent(
|
|
|
|
|
errorMessage = `Client error: ${error.message}`;
|
|
|
|
|
} else if (error.message.includes('HTTP 5')) {
|
|
|
|
|
errorMessage = `Server error: ${error.message}`;
|
|
|
|
|
} else if (error.message.includes('TimeoutError')) {
|
|
|
|
|
errorMessage = "Puppeteer browser launch timeout - both Puppeteer and fetch fallback failed";
|
|
|
|
|
} else {
|
|
|
|
|
errorMessage = error.message;
|
|
|
|
|
}
|
|
|
|
|