Files
VoiceRSSSummary/services/content-extractor.ts
Satsuki Akiba 99af4d11b1
Some checks failed
CI / lint-and-test (push) Failing after 4m2s
CI / docker-test (push) Has been skipped
CI / security-scan (push) Has been skipped
Build and Publish Docker Images / build (push) Failing after 2m8s
Fix
2025-06-12 13:09:38 +09:00

1059 lines
37 KiB
TypeScript

import puppeteer, { type Browser } from "puppeteer";
import * as cheerio from "cheerio";
import type { CheerioAPI } from "cheerio";
/**
 * Outcome of a single article extraction attempt.
 */
export interface ExtractedContent {
/** Page title picked from meta tags, headings or `<title>`, when one was found. */
title?: string;
/** Extracted article text. */
content: string;
/** Short summary taken from description meta tags or microdata, when present. */
description?: string;
/** Whether the attempt produced enough text to be considered usable. */
success: boolean;
/** Reason for the failure; set on the failure results built in this file. */
error?: string;
}
/**
 * Tuning knobs for `retryWithBackoff`.
 */
interface RetryOptions {
/**
 * Upper bound on the attempt counter: retrying stops once `attempt >= maxRetries`.
 * With the default starting attempt of 1 this is the total number of tries.
 */
maxRetries: number;
/** Delay in milliseconds before the first retry. */
baseDelay: number;
/** Ceiling in milliseconds applied to every computed delay. */
maxDelay: number;
/** Factor the delay is multiplied by for each further attempt. */
backoffMultiplier: number;
}
/**
 * Retry settings used when `retryWithBackoff` is called without explicit options:
 * up to 3 attempts, waiting 1000 ms and then 2000 ms between them (never more than 10000 ms).
 */
const DEFAULT_RETRY_OPTIONS: RetryOptions = {
maxRetries: 3,
baseDelay: 1000,
maxDelay: 10000,
backoffMultiplier: 2
};
// Singleton browser instance for reuse.
// Launched lazily by getBrowser() and reset to null by closeBrowser();
// null means no browser has been launched (or it has been closed).
let sharedBrowser: Browser | null = null;
/**
 * Pause for a fixed amount of time.
 *
 * Stand-in for `page.waitForTimeout`.
 *
 * @param ms - Number of milliseconds to wait before the returned promise resolves.
 */
async function waitForTimeout(ms: number): Promise<void> {
  await new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Give a freshly loaded page a chance to finish rendering client-side content.
 *
 * After waiting for `<body>`, four best-effort strategies are started together,
 * each capped by its own deadline:
 *   1. wait for a loading indicator to appear and then disappear (3 s),
 *   2. scroll down in steps to trigger lazy loading, then back to the top (8 s),
 *   3. wait for a typical article container to exist (5 s),
 *   4. click the first "read more"-style button that is found (3 s).
 * A final 2 s pause follows. If anything throws, the function logs the error
 * and waits 3 s instead, so it never rejects because of a strategy failure.
 *
 * Note that a deadline only stops this function from waiting; the strategy
 * itself is not cancelled and may still be running in the background.
 *
 * @param page - Page-like object exposing `waitForSelector`, `evaluate` and `$`
 *               (typed `any`; the caller in this file passes a Puppeteer page).
 */
async function handleDynamicContent(page: any): Promise<void> {
  try {
    console.log('Starting dynamic content handling...');
    // Wait for initial content
    await page.waitForSelector('body', { timeout: 5000 });

    // Strategy 1: wait for common loading indicators to disappear
    const waitForLoadingIndicators = async (): Promise<void> => {
      const loadingSelectors = [
        '.loading', '.loader', '.spinner', '.skeleton',
        '[class*="loading"]', '[class*="skeleton"]',
        '.placeholder', '.shimmer'
      ];
      for (const selector of loadingSelectors) {
        try {
          // The indicator must first show up, then go away.
          await page.waitForSelector(selector, { timeout: 2000 });
          await page.waitForSelector(selector, { hidden: true, timeout: 10000 });
          break;
        } catch (e) {
          // Indicator absent (or never hidden): try the next selector
        }
      }
    };

    // Strategy 2: auto-scroll to trigger lazy loading
    const autoScroll = async (): Promise<void> => {
      await page.evaluate(() => {
        return new Promise<void>((resolve) => {
          let totalHeight = 0;
          const distance = 500;
          const timer = setInterval(() => {
            const scrollHeight = document.body.scrollHeight;
            window.scrollBy(0, distance);
            totalHeight += distance;
            // Stop at the bottom of the page or after 5000px, whichever comes first.
            if (totalHeight >= scrollHeight || totalHeight > 5000) {
              clearInterval(timer);
              window.scrollTo(0, 0); // Scroll back to top
              setTimeout(() => resolve(), 1000);
            }
          }, 200);
        });
      });
    };

    // Strategy 3: wait for content-specific containers
    const waitForContentContainer = async (): Promise<void> => {
      const contentSelectors = [
        'article', '.article-content', '.post-content', '.entry-content',
        'main', '[role="main"]', '.main-content'
      ];
      for (const selector of contentSelectors) {
        try {
          await page.waitForSelector(selector, { timeout: 3000 });
          break;
        } catch (e) {
          // Not present: try the next selector
        }
      }
    };

    // Strategy 4: handle "Read More" or expansion buttons
    const clickExpandButton = async (): Promise<void> => {
      const expandButtons = [
        'button[class*="read-more"]', 'button[class*="expand"]',
        '.read-more', '.show-more', '.expand-content',
        'a[class*="read-more"]', 'a[class*="continue"]'
      ];
      for (const selector of expandButtons) {
        try {
          const button = await page.$(selector);
          if (button) {
            await button.click();
            await waitForTimeout(2000);
            break;
          }
        } catch (e) {
          // Click failed: try the next button
        }
      }
    };

    // Race a strategy against a deadline. The deadline timer is cleared as soon
    // as the race settles so no stray timers stay armed after this function returns.
    const executeWithTimeout = async (strategy: () => Promise<void>, timeout: number): Promise<void> => {
      let timer: ReturnType<typeof setTimeout> | undefined;
      const deadline = new Promise<void>((resolve) => {
        timer = setTimeout(resolve, timeout);
      });
      try {
        await Promise.race([strategy(), deadline]);
      } finally {
        clearTimeout(timer);
      }
    };

    // Execute all strategies in parallel with timeouts
    await Promise.allSettled([
      executeWithTimeout(waitForLoadingIndicators, 3000),
      executeWithTimeout(autoScroll, 8000),
      executeWithTimeout(waitForContentContainer, 5000),
      executeWithTimeout(clickExpandButton, 3000)
    ]);

    // Final wait for any remaining dynamic content
    await waitForTimeout(2000);
  } catch (error) {
    console.log('Dynamic content handling failed, using basic timeout:', error);
    // If dynamic content handling fails, continue with basic timeout
    await waitForTimeout(3000);
  }
  console.log('Dynamic content handling completed.');
}
/**
 * Return the shared headless browser, launching it on first use or after the
 * previous instance has disconnected.
 *
 * @returns The browser stored in `sharedBrowser`.
 */
async function getBrowser(): Promise<Browser> {
  // Fast path: an existing, still-connected instance is reused as-is.
  if (sharedBrowser && sharedBrowser.isConnected()) {
    return sharedBrowser;
  }

  const launchArgs = [
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-dev-shm-usage",
    "--disable-accelerated-2d-canvas",
    "--no-first-run",
    "--no-zygote",
    "--disable-gpu",
    "--disable-web-security",
    "--disable-features=VizDisplayCompositor",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-renderer-backgrounding",
    "--disable-field-trial-config",
    "--disable-ipc-flooding-protection",
    "--enable-automation",
    "--force-device-scale-factor=1",
    "--ignore-certificate-errors",
    "--ignore-ssl-errors",
    "--ignore-certificate-errors-spki-list",
    "--allow-running-insecure-content",
    "--disable-extensions",
    "--no-default-browser-check",
    "--disable-default-apps",
    "--disable-sync",
    "--metrics-recording-only",
    "--no-pings",
    "--mute-audio"
  ];

  sharedBrowser = await puppeteer.launch({
    headless: true,
    args: launchArgs,
  });
  return sharedBrowser;
}
/**
 * Run `operation`, retrying with exponential backoff on transient
 * network/browser failures.
 *
 * An error is retried only when it is an `Error` whose message contains one of
 * the known transient markers; anything else is rethrown immediately. Retrying
 * also stops (rethrowing the last error) once the attempt counter reaches
 * `options.maxRetries`.
 *
 * @param operation - Async work to execute.
 * @param options - Backoff settings; defaults to `DEFAULT_RETRY_OPTIONS`.
 * @param attempt - 1-based number of the first attempt.
 * @returns Whatever `operation` resolves to.
 */
async function retryWithBackoff<T>(
  operation: () => Promise<T>,
  options: RetryOptions = DEFAULT_RETRY_OPTIONS,
  attempt: number = 1
): Promise<T> {
  // Message fragments that mark a failure as worth retrying.
  const transientMarkers = [
    'ERR_SOCKET_NOT_CONNECTED',
    'ERR_CONNECTION_REFUSED',
    'ERR_CONNECTION_RESET',
    'ERR_NETWORK_CHANGED',
    'ERR_INTERNET_DISCONNECTED',
    'ERR_NAME_NOT_RESOLVED',
    'ERR_TIMED_OUT',
    'Protocol error',
    'Navigation timeout',
    'net::',
    'Target closed',
    'Session closed'
  ];

  for (let currentAttempt = attempt; ; currentAttempt += 1) {
    try {
      return await operation();
    } catch (error) {
      // Out of attempts: surface the failure unchanged.
      if (currentAttempt >= options.maxRetries) {
        throw error;
      }
      if (!(error instanceof Error)) {
        throw error;
      }
      const failureMessage = error.message;
      const retryable = transientMarkers.some((marker) => failureMessage.includes(marker));
      if (!retryable) {
        throw error;
      }
      // baseDelay * multiplier^(attempt - 1), capped at maxDelay.
      const uncapped = options.baseDelay * options.backoffMultiplier ** (currentAttempt - 1);
      const delay = Math.min(uncapped, options.maxDelay);
      console.log(`Attempt ${currentAttempt} failed, retrying in ${delay}ms:`, failureMessage);
      await waitForTimeout(delay);
    }
  }
}
/**
 * Close the shared browser, if any, and forget the reference.
 *
 * The reference is cleared before awaiting `close()` so that a `getBrowser()`
 * call made while the close is in flight launches a fresh instance instead of
 * handing out a browser that is shutting down. It also guarantees the handle
 * is dropped even when `close()` rejects or the browser had already
 * disconnected.
 *
 * @throws Whatever `Browser.close()` rejects with.
 */
export async function closeBrowser(): Promise<void> {
  const browser = sharedBrowser;
  sharedBrowser = null;
  if (browser && browser.isConnected()) {
    await browser.close();
  }
}
/**
 * Fallback content extraction using fetch + cheerio (no headless browser).
 *
 * Downloads the HTML with a 30 second timeout, strips boilerplate elements,
 * scores candidate containers and keeps the best-scoring text. If that yields
 * fewer than 200 characters it falls back to aggregating `<p>` text, and then
 * to JSON-LD structured data (`articleBody` / `text`).
 *
 * JSON-LD payloads are captured before `<script>` elements are stripped;
 * otherwise the structured-data fallback could never find them.
 *
 * Never throws: every failure is reported as `{ success: false, error }`.
 *
 * @param url - Address of the article to download.
 * @returns Title, description and content (content is cut to 50,000 characters
 *          plus "..."); `success` is false when fewer than 100 characters were
 *          extracted or the request failed.
 */
async function extractWithFetchFallback(url: string): Promise<ExtractedContent> {
  console.log(`Using fetch fallback for: ${url}`);
  try {
    const userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
    const response = await fetch(url, {
      headers: {
        'User-Agent': userAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'no-cache'
      },
      signal: AbortSignal.timeout(30000) // 30 second timeout
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    // Grab JSON-LD payloads now: the cleanup below removes every <script>.
    const jsonLdBlocks: string[] = $('script[type="application/ld+json"]')
      .toArray()
      .map((node) => $(node).html() ?? '')
      .filter((raw) => raw.trim().length > 0);

    // Remove unwanted elements first
    const unwantedSelectors = [
      "script", "style", "noscript", "iframe", "embed", "object",
      "nav", "header", "footer", "aside", "form",
      ".advertisement", ".ads", ".ad", ".adsbygoogle", "[class*='ad-']", "[id*='ad-']",
      ".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb",
      ".social-share", ".share", ".social", ".sns",
      ".comments", ".comment", ".disqus",
      ".cookie-banner", ".cookie", ".gdpr",
      ".popup", ".modal", ".overlay", ".lightbox",
      ".related", ".recommended", ".more-stories",
      ".tags", ".categories", ".metadata",
      ".author-bio", ".author-info",
      ".newsletter", ".subscribe", ".signup",
      "[role='complementary']", "[role='banner']", "[role='contentinfo']",
      "[aria-label*='advertisement']", "[aria-label*='sidebar']"
    ];
    unwantedSelectors.forEach((selector) => {
      $(selector).remove();
    });

    // Extract title: first non-empty source wins
    const titleSources = [
      $('meta[property="og:title"]').attr('content'),
      $('meta[name="twitter:title"]').attr('content'),
      $('h1').first().text().trim(),
      $('.article-title, .post-title, .entry-title').first().text().trim(),
      $('title').text().trim(),
      $('[itemprop="headline"]').first().text().trim()
    ];
    const title = titleSources.find((source) => !!source) ?? "";

    // Extract description: first non-empty source wins
    const descriptionSources = [
      $('meta[property="og:description"]').attr('content'),
      $('meta[name="description"]').attr('content'),
      $('meta[name="twitter:description"]').attr('content'),
      $('[itemprop="description"]').first().text().trim()
    ];
    const description = descriptionSources.find((source) => !!source) ?? "";

    // Content selectors (same as in Puppeteer version)
    const contentSelectors = [
      '[itemtype*="Article"] [itemprop="articleBody"]',
      '[itemtype*="NewsArticle"] [itemprop="articleBody"]',
      '[itemtype*="BlogPosting"] [itemprop="articleBody"]',
      'article[role="main"]',
      'main article',
      '[role="main"] article',
      'article',
      '.post-content', '.entry-content', '.article-content', '.content-area',
      '.article-body', '.post-body', '.entry-body', '.story-body',
      '.main-content', '.primary-content', '.page-content',
      '.news-content', '.blog-content', '.editorial-content',
      '.wp-content', '.entry', '.post',
      '.section-content', '.postArticle-content', '.post-full-content',
      '.markup', '.section--body', '.section-divider + .section-content',
      '.honbun', '.main_text', '.article_body', '.news_body',
      '.entry_text', '.blog_text', '.content_text',
      '.kiji', '.news', '.article',
      'main', '[role="main"]',
      '#content', '#main', '#article', '#post', '#entry',
      '#main-content', '#primary', '#content-area',
      '.content', '.main', '.wrapper', '.container'
    ];

    // Heuristic quality score for a candidate container (higher is better).
    const calculateContentScore = (element: cheerio.Cheerio<any>): number => {
      const text = element.text() || '';
      if (text.length < 100) return 0;
      let score = 0;
      // Base score from text length (diminishing returns)
      score += Math.min(text.length / 100, 50);
      // Paragraph density
      const paragraphs = element.find('p');
      const avgParagraphLength = paragraphs.length > 0
        ? paragraphs.toArray().reduce((sum, p) => sum + ($(p).text().length || 0), 0) / paragraphs.length
        : 0;
      if (avgParagraphLength > 100) score += 20;
      if (paragraphs.length > 3) score += 10;
      // Link density penalty (articles shouldn't be mostly links)
      const links = element.find('a');
      const linkText = links.toArray().reduce((sum, link) => sum + ($(link).text().length || 0), 0);
      const linkDensity = text.length > 0 ? linkText / text.length : 0;
      if (linkDensity < 0.2) score += 15;
      else if (linkDensity < 0.4) score += 5;
      else score -= 10;
      // Bonus for article-like structure
      if (element.prop('tagName') === 'ARTICLE') score += 25;
      if (element.attr('role') === 'main') score += 20;
      if (element.find('h1, h2, h3').length > 0) score += 10;
      // Bonus for semantic elements
      const semanticElements = element.find('p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol');
      if (semanticElements.length > 5) score += 15;
      // Penalty for navigation-like content
      const navWords = ['メニュー', 'ナビ', 'カテゴリ', 'タグ', 'menu', 'navigation', 'nav', 'sidebar'];
      const className = (element.attr('class') || '').toLowerCase();
      const id = (element.attr('id') || '').toLowerCase();
      if (navWords.some(word => className.includes(word) || id.includes(word))) {
        score -= 20;
      }
      return Math.max(score, 0);
    };

    // Collapse all whitespace runs (newlines included) to single spaces, drop
    // zero-width characters and trim.
    const cleanText = (text: string): string => {
      return text
        .replace(/\s+/g, ' ')
        .replace(/[\u200B-\u200D\uFEFF]/g, '')
        .trim();
    };

    // Walk parsed JSON-LD (object, array or @graph) looking for article text.
    const findArticleText = (node: unknown): string | undefined => {
      if (Array.isArray(node)) {
        for (const item of node) {
          const found = findArticleText(item);
          if (found) return found;
        }
        return undefined;
      }
      if (node === null || typeof node !== 'object') return undefined;
      const record = node as Record<string, unknown>;
      if (typeof record.articleBody === 'string' && record.articleBody.length > 0) {
        return record.articleBody;
      }
      if (typeof record.text === 'string' && record.text.length > 0) {
        return record.text;
      }
      return findArticleText(record['@graph']);
    };

    // Collect and score all content candidates
    interface ContentCandidate {
      element: cheerio.Cheerio<any>;
      score: number;
      content: string;
      selector: string;
    }
    const candidates: ContentCandidate[] = [];
    for (const selector of contentSelectors) {
      try {
        $(selector).each((index, element) => {
          const $element = $(element);
          const text = $element.text() || '';
          if (text.length > 200) { // Minimum content threshold
            candidates.push({
              element: $element,
              score: calculateContentScore($element),
              content: cleanText(text),
              selector: `${selector}[${index}]`
            });
          }
        });
      } catch (e) {
        // Skip selectors cheerio cannot evaluate
        continue;
      }
    }
    // Sort candidates by score (highest first)
    candidates.sort((a, b) => b.score - a.score);
    console.log(`Found ${candidates.length} content candidates`);
    if (candidates.length > 0) {
      console.log(`Best candidate score: ${candidates[0]!.score}, selector: ${candidates[0]!.selector}`);
    }

    // Get the best content
    let content = "";
    if (candidates.length > 0) {
      content = candidates[0]!.content;
      // If the best candidate is still short, try combining top candidates
      if (content.length < 500 && candidates.length > 1) {
        const topCandidates = candidates.slice(0, 3).filter(c => c.score > 10);
        const combinedContent = topCandidates.map(c => c.content).join('\n\n');
        if (combinedContent.length > content.length) {
          content = cleanText(combinedContent);
        }
      }
    }

    // Fallback strategies if still no good content
    if (!content || content.length < 200) {
      console.log('Using paragraph aggregation fallback...');
      const paragraphs = $('p').toArray()
        .map(p => $(p).text().trim())
        .filter(p => p.length > 50)
        .filter(p => {
          // Filter out likely navigation/boilerplate paragraphs
          const lowerP = p.toLowerCase();
          return !lowerP.includes('cookie') &&
            !lowerP.includes('privacy') &&
            !lowerP.includes('terms of service') &&
            !lowerP.includes('subscribe') &&
            !lowerP.includes('newsletter');
        });
      if (paragraphs.length > 0) {
        content = cleanText(paragraphs.join('\n\n'));
      }
    }

    // Final fallback: structured data captured before the cleanup pass
    if (!content || content.length < 200) {
      console.log('Trying structured data fallback...');
      for (const raw of jsonLdBlocks) {
        try {
          const articleText = findArticleText(JSON.parse(raw));
          if (articleText) {
            content = cleanText(articleText);
            break;
          }
        } catch {
          // Ignore JSON parsing errors and try the next block
        }
      }
    }

    // Limit content length to avoid token limits
    const maxLength = 50000;
    if (content.length > maxLength) {
      content = content.substring(0, maxLength) + "...";
    }
    console.log(`Fetch fallback extracted content: ${content.length} characters`);
    if (!content || content.length < 100) {
      return {
        title: title || '',
        content: '',
        description: description || '',
        success: false,
        error: `Insufficient content extracted via fetch fallback (${content?.length || 0} characters)`,
      };
    }
    return {
      title: title || '',
      content,
      description: description || '',
      success: true,
    };
  } catch (error) {
    console.error(`Fetch fallback failed:`, error);
    return {
      title: '',
      content: '',
      description: '',
      success: false,
      error: error instanceof Error ? error.message : 'Unknown error in fetch fallback',
    };
  }
}
/**
 * Extract article content by rendering the page in the shared headless
 * browser, retrying transient failures via `retryWithBackoff`.
 *
 * Each attempt opens a new page with a random user agent, blocks
 * image/media/font requests, navigates (first waiting for network idle, then
 * falling back to `domcontentloaded`), lets dynamic content settle and runs a
 * scoring-based extraction inside the page. If fewer than 100 characters come
 * back, the cleaned text of the whole body is tried as a last resort.
 *
 * JSON-LD payloads are read before `<script>` elements are stripped from the
 * DOM; otherwise the structured-data fallback could never find them.
 *
 * @param url - Address of the article to load.
 * @returns The extraction result. A `success: false` result is returned (not
 *          thrown, hence not retried) when too little text could be extracted.
 * @throws When navigation fails, no response is received or the HTTP status
 *         is 400 or above, once `retryWithBackoff` stops retrying.
 */
async function extractWithRetry(url: string): Promise<ExtractedContent> {
  const userAgents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
  ];
  return retryWithBackoff(async () => {
    let page = null;
    try {
      const browser = await getBrowser();
      page = await browser.newPage();
      // Randomize user agent to avoid detection
      const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)] || userAgents[0];
      await page.setUserAgent(userAgent!);
      await page.setViewport({ width: 1280, height: 720 });
      // Set longer timeout for problematic sites
      page.setDefaultNavigationTimeout(60000);
      page.setDefaultTimeout(60000);
      // Block unnecessary resources to speed up loading
      await page.setRequestInterception(true);
      page.on('request', (req) => {
        const resourceType = req.resourceType();
        const blocked = resourceType === 'image' || resourceType === 'media' || resourceType === 'font';
        const decision = blocked ? req.abort() : req.continue();
        // abort()/continue() reject when the request was already handled or the
        // page closed in the meantime; that is harmless here, but an unhandled
        // rejection would not be.
        decision.catch(() => {
          // Intentionally ignored (see above)
        });
      });
      // Add extra headers to appear more like a real browser
      await page.setExtraHTTPHeaders({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
      });
      // Navigate with multiple wait strategies
      let response;
      try {
        response = await page.goto(url, {
          waitUntil: ["domcontentloaded", "networkidle0"],
          timeout: 60000,
        });
      } catch (networkError) {
        // Fallback to more basic wait strategy
        const errorMessage = networkError instanceof Error ? networkError.message : 'Unknown error';
        console.log('Network idle failed, trying domcontentloaded only:', errorMessage);
        response = await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: 60000,
        });
      }
      if (!response) {
        throw new Error('No response received from server');
      }
      const status = response.status();
      if (status && status >= 400) {
        throw new Error(`HTTP ${status}: ${response.statusText() || 'Unknown error'}`);
      }
      // Enhanced dynamic content handling
      console.log('Handling dynamic content...');
      await handleDynamicContent(page);
      // Extract content using advanced multi-strategy approach
      console.log('Extracting content using multi-strategy approach...');
      // NOTE: this callback runs inside the browser page, so it must be fully
      // self-contained and cannot reference anything from the Node scope.
      const extractedData = await page.evaluate(() => {
        interface ContentCandidate {
          element: Element;
          score: number;
          content: string;
          selector: string;
        }

        // Grab JSON-LD payloads now: the cleanup below removes every <script>.
        const jsonLdBlocks = Array.from(document.querySelectorAll('script[type="application/ld+json"]'))
          .map((node) => node.textContent || '')
          .filter((raw) => raw.trim().length > 0);

        // Remove unwanted elements first
        const unwantedSelectors = [
          "script", "style", "noscript", "iframe", "embed", "object",
          "nav", "header", "footer", "aside", "form",
          ".advertisement", ".ads", ".ad", ".adsbygoogle", "[class*='ad-']", "[id*='ad-']",
          ".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb",
          ".social-share", ".share", ".social", ".sns",
          ".comments", ".comment", ".disqus",
          ".cookie-banner", ".cookie", ".gdpr",
          ".popup", ".modal", ".overlay", ".lightbox",
          ".related", ".recommended", ".more-stories",
          ".tags", ".categories", ".metadata",
          ".author-bio", ".author-info",
          ".newsletter", ".subscribe", ".signup",
          "[role='complementary']", "[role='banner']", "[role='contentinfo']",
          "[aria-label*='advertisement']", "[aria-label*='sidebar']"
        ];
        unwantedSelectors.forEach((selector) => {
          try {
            const elements = document.querySelectorAll(selector);
            elements.forEach((el) => el.remove());
          } catch (e) {
            // Ignore invalid selectors
          }
        });

        // Extract title: first non-empty source wins
        let title = "";
        const titleSources = [
          () => document.querySelector('meta[property="og:title"]')?.getAttribute('content'),
          () => document.querySelector('meta[name="twitter:title"]')?.getAttribute('content'),
          () => document.querySelector('h1')?.textContent?.trim(),
          () => document.querySelector('.article-title, .post-title, .entry-title')?.textContent?.trim(),
          () => document.querySelector('title')?.textContent?.trim(),
          () => document.querySelector('[itemprop="headline"]')?.textContent?.trim()
        ];
        for (const source of titleSources) {
          try {
            const result = source();
            if (result && result.length > 0) {
              title = result;
              break;
            }
          } catch (e) {
            continue;
          }
        }

        // Extract description: first non-empty source wins
        let description = "";
        const descriptionSources = [
          () => document.querySelector('meta[property="og:description"]')?.getAttribute('content'),
          () => document.querySelector('meta[name="description"]')?.getAttribute('content'),
          () => document.querySelector('meta[name="twitter:description"]')?.getAttribute('content'),
          () => document.querySelector('[itemprop="description"]')?.textContent?.trim()
        ];
        for (const source of descriptionSources) {
          try {
            const result = source();
            if (result && result.length > 0) {
              description = result;
              break;
            }
          } catch (e) {
            continue;
          }
        }

        // Comprehensive content selectors with priorities
        const contentSelectors = [
          // Schema.org and structured data
          '[itemtype*="Article"] [itemprop="articleBody"]',
          '[itemtype*="NewsArticle"] [itemprop="articleBody"]',
          '[itemtype*="BlogPosting"] [itemprop="articleBody"]',
          // High-priority semantic selectors
          'article[role="main"]',
          'main article',
          '[role="main"] article',
          'article',
          // Common CMS and platform selectors
          '.post-content', '.entry-content', '.article-content', '.content-area',
          '.article-body', '.post-body', '.entry-body', '.story-body',
          '.main-content', '.primary-content', '.page-content',
          '.news-content', '.blog-content', '.editorial-content',
          // WordPress specific
          '.wp-content', '.entry', '.post',
          // Medium, Substack, Ghost
          '.section-content', '.postArticle-content', '.post-full-content',
          '.markup', '.section--body', '.section-divider + .section-content',
          // Japanese sites specific
          '.honbun', '.main_text', '.article_body', '.news_body',
          '.entry_text', '.blog_text', '.content_text',
          '.kiji', '.news', '.article',
          // Generic semantic HTML5
          'main', '[role="main"]',
          // ID-based selectors
          '#content', '#main', '#article', '#post', '#entry',
          '#main-content', '#primary', '#content-area',
          // Class-based common patterns
          '.content', '.main', '.wrapper', '.container',
          // Fallbacks
          'body'
        ];

        // Heuristic quality score for a candidate container (higher is better).
        function calculateContentScore(element: Element): number {
          if (!element) return 0;
          const text = element.textContent || '';
          if (text.length < 100) return 0;
          let score = 0;
          // Base score from text length (diminishing returns)
          score += Math.min(text.length / 100, 50);
          // Paragraph density
          const paragraphs = element.querySelectorAll('p');
          const avgParagraphLength = paragraphs.length > 0
            ? Array.from(paragraphs).reduce((sum, p) => sum + (p.textContent?.length || 0), 0) / paragraphs.length
            : 0;
          if (avgParagraphLength > 100) score += 20;
          if (paragraphs.length > 3) score += 10;
          // Link density penalty (articles shouldn't be mostly links)
          const links = element.querySelectorAll('a');
          const linkText = Array.from(links).reduce((sum, link) => sum + (link.textContent?.length || 0), 0);
          const linkDensity = text.length > 0 ? linkText / text.length : 0;
          if (linkDensity < 0.2) score += 15;
          else if (linkDensity < 0.4) score += 5;
          else score -= 10;
          // Bonus for article-like structure
          if (element.tagName === 'ARTICLE') score += 25;
          if (element.getAttribute('role') === 'main') score += 20;
          if (element.querySelector('h1, h2, h3')) score += 10;
          // Bonus for semantic elements
          const semanticElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol');
          if (semanticElements.length > 5) score += 15;
          // Penalty for too many images without text
          const images = element.querySelectorAll('img');
          if (images.length > text.length / 500) score -= 5;
          // Penalty for navigation-like content. The class is read through
          // getAttribute because `className` is not a string on SVG elements.
          const navWords = ['メニュー', 'ナビ', 'カテゴリ', 'タグ', 'menu', 'navigation', 'nav', 'sidebar'];
          const className = (element.getAttribute('class') || '').toLowerCase();
          const id = (element.id || '').toLowerCase();
          if (navWords.some(word => className.includes(word) || id.includes(word))) {
            score -= 20;
          }
          return Math.max(score, 0);
        }

        // Collapse all whitespace runs (newlines included) to single spaces,
        // drop zero-width characters and trim.
        function cleanText(text: string): string {
          return text
            .replace(/\s+/g, ' ')
            .replace(/[\u200B-\u200D\uFEFF]/g, '')
            .trim();
        }

        // Walk parsed JSON-LD (object, array or @graph) looking for article text.
        function findArticleText(node: unknown): string | undefined {
          if (Array.isArray(node)) {
            for (const item of node) {
              const found = findArticleText(item);
              if (found) return found;
            }
            return undefined;
          }
          if (node === null || typeof node !== 'object') return undefined;
          const record = node as Record<string, unknown>;
          if (typeof record.articleBody === 'string' && record.articleBody.length > 0) {
            return record.articleBody;
          }
          if (typeof record.text === 'string' && record.text.length > 0) {
            return record.text;
          }
          return findArticleText(record['@graph']);
        }

        // Collect and score all content candidates
        const candidates: ContentCandidate[] = [];
        for (const selector of contentSelectors) {
          try {
            const elements = document.querySelectorAll(selector);
            elements.forEach((element, index) => {
              const text = element.textContent || '';
              if (text.length > 200) { // Minimum content threshold
                const score = calculateContentScore(element);
                candidates.push({
                  element,
                  score,
                  content: cleanText(text),
                  selector: `${selector}[${index}]`
                });
              }
            });
          } catch (e) {
            // Skip invalid selectors
            continue;
          }
        }
        // Sort candidates by score (highest first)
        candidates.sort((a, b) => b.score - a.score);
        console.log(`Found ${candidates.length} content candidates`);
        if (candidates.length > 0) {
          console.log(`Best candidate score: ${candidates[0]!.score}, selector: ${candidates[0]!.selector}`);
        }

        // Get the best content
        let content = "";
        if (candidates.length > 0) {
          content = candidates[0]!.content;
          // If the best candidate is still short, try combining top candidates
          if (content.length < 500 && candidates.length > 1) {
            const topCandidates = candidates.slice(0, 3).filter(c => c.score > 10);
            const combinedContent = topCandidates.map(c => c.content).join('\n\n');
            if (combinedContent.length > content.length) {
              content = cleanText(combinedContent);
            }
          }
        }

        // Fallback strategies if still no good content
        if (!content || content.length < 200) {
          // Try paragraph aggregation
          const paragraphs = Array.from(document.querySelectorAll('p'))
            .map(p => p.textContent?.trim() || '')
            .filter(p => p.length > 50)
            .filter(p => {
              // Filter out likely navigation/boilerplate paragraphs
              const lowerP = p.toLowerCase();
              return !lowerP.includes('cookie') &&
                !lowerP.includes('privacy') &&
                !lowerP.includes('terms of service') &&
                !lowerP.includes('subscribe') &&
                !lowerP.includes('newsletter');
            });
          if (paragraphs.length > 0) {
            content = cleanText(paragraphs.join('\n\n'));
          }
        }

        // Final fallback: structured data captured before the cleanup pass
        if (!content || content.length < 200) {
          for (const raw of jsonLdBlocks) {
            try {
              const articleText = findArticleText(JSON.parse(raw));
              if (articleText) {
                content = cleanText(articleText);
                break;
              }
            } catch (e) {
              // Ignore JSON parsing errors and try the next block
            }
          }
        }
        console.log(`Final content length: ${content.length} characters`);
        return { title, content, description };
      });

      // Validate extracted content with more lenient threshold
      if (!extractedData.content || extractedData.content.length < 100) {
        // Try one more extraction attempt with relaxed criteria
        const fallbackData = await page.evaluate(() => {
          // Last resort: extract all text from body, excluding common noise
          const body = document.body;
          if (body) {
            // Clone body to avoid modifying original
            const bodyClone = body.cloneNode(true) as Element;
            // Remove noise elements from clone
            const noiseSelectors = [
              'script', 'style', 'nav', 'header', 'footer', 'aside',
              '.ad', '.ads', '.advertisement', '[class*="ad-"]',
              '.menu', '.navigation', '.sidebar', '.social',
              '.cookie', '.popup', '.modal'
            ];
            noiseSelectors.forEach(selector => {
              const elements = bodyClone.querySelectorAll(selector);
              elements.forEach(el => el.remove());
            });
            const text = bodyClone.textContent || '';
            return text.replace(/\s+/g, ' ').trim();
          }
          return '';
        });
        if (fallbackData && fallbackData.length > 200) {
          extractedData.content = fallbackData;
        } else {
          return {
            title: extractedData.title || '',
            content: extractedData.content || "",
            description: extractedData.description || '',
            success: false,
            error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
          };
        }
      }

      // Limit content length to avoid token limits
      const maxLength = 50000;
      let content = extractedData.content;
      if (content.length > maxLength) {
        content = content.substring(0, maxLength) + "...";
      }
      console.log(`Successfully extracted content: ${content.length} characters`);
      return {
        title: extractedData.title,
        content,
        description: extractedData.description,
        success: true,
      };
    } catch (error) {
      console.error(`Content extraction attempt failed:`, error);
      throw error; // Let retry logic handle this
    } finally {
      // Always release the page, even when the attempt failed.
      if (page) {
        try {
          await page.close();
        } catch (closeError) {
          console.warn('Failed to close page:', closeError);
        }
      }
    }
  });
}
/**
 * Extract the article at `url`, preferring the headless browser and falling
 * back to a plain fetch when the browser itself failed.
 *
 * Never throws: failures are reported as `{ success: false, error }` with a
 * human-readable message.
 *
 * Timeouts are recognised by the error's `name` as well as its message.
 * Puppeteer's `TimeoutError` carries the class name in `name`, while the
 * message reads e.g. "Navigation timeout of 60000 ms exceeded", so a
 * message-only check would never match it.
 *
 * @param url - Address of the article to extract.
 * @returns The extraction result from Puppeteer, from the fetch fallback, or a
 *          failure result describing the Puppeteer error.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  console.log(`Starting content extraction for: ${url}`);
  try {
    return await extractWithRetry(url);
  } catch (error) {
    console.error(`Content extraction failed after all retries for ${url}:`, error);

    const failure = error instanceof Error ? error : undefined;
    const message = failure ? failure.message : '';
    const isTimeout = failure !== undefined &&
      (failure.name === 'TimeoutError' || message.includes('TimeoutError'));

    // Browser launch/connection failures (and timeouts) that should trigger the fetch fallback
    const fallbackMarkers = [
      'Timed out after',
      'waiting for the WS endpoint URL',
      'Browser closed',
      'Target closed',
      'Session closed',
      'Protocol error',
      'Connection terminated',
      'spawn', // Process spawn errors
      'ECONNRESET',
      'ECONNREFUSED',
      'ENOTFOUND'
    ];
    const shouldUseFallback = failure !== undefined &&
      (isTimeout || fallbackMarkers.some((marker) => message.includes(marker)));

    if (shouldUseFallback) {
      console.log(`Puppeteer failed, trying fetch fallback for ${url}`);
      try {
        const fallbackResult = await extractWithFetchFallback(url);
        if (fallbackResult.success) {
          console.log(`Fetch fallback succeeded for ${url}`);
          return fallbackResult;
        }
        console.log(`Fetch fallback also failed for ${url}:`, fallbackResult.error);
      } catch (fallbackError) {
        console.error(`Fetch fallback threw error for ${url}:`, fallbackError);
      }
    }

    // Provide more specific error messages
    let errorMessage = "Unknown error occurred";
    if (failure) {
      if (message.includes('ERR_SOCKET_NOT_CONNECTED')) {
        errorMessage = "Network connection failed - server may be unreachable";
      } else if (message.includes('ERR_CONNECTION_REFUSED')) {
        errorMessage = "Connection refused by server";
      } else if (message.includes('ERR_NAME_NOT_RESOLVED')) {
        errorMessage = "DNS resolution failed - domain may not exist";
      } else if (message.includes('ERR_TIMED_OUT')) {
        errorMessage = "Request timed out - server too slow";
      } else if (message.includes('HTTP 4')) {
        errorMessage = `Client error: ${message}`;
      } else if (message.includes('HTTP 5')) {
        errorMessage = `Server error: ${message}`;
      } else if (isTimeout) {
        // Covers launch and navigation timeouts alike, so keep the original detail.
        errorMessage = `Puppeteer timed out - both Puppeteer and fetch fallback failed: ${message}`;
      } else {
        errorMessage = message;
      }
    }
    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: errorMessage,
    };
  }
}
/**
 * Make sure a feed item has usable body text, fetching the article only when
 * the feed itself did not provide enough.
 *
 * @param _originalTitle - Title from the feed (currently unused).
 * @param originalLink - URL of the article, used when extraction is needed.
 * @param originalContent - Body text supplied by the feed, if any.
 * @param originalDescription - Summary supplied by the feed, if any.
 * @returns The feed's own content/description when the first non-empty one of
 *          them is longer than 500 characters or when extraction fails;
 *          otherwise the extracted content, with the extracted description
 *          preferred over the feed's.
 */
export async function enhanceArticleContent(
  _originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  const feedValues = {
    content: originalContent,
    description: originalDescription,
  };

  // Enough text already: skip the (expensive) page extraction.
  const existingContent = originalContent || originalDescription || "";
  if (existingContent.length > 500) {
    return feedValues;
  }

  const extracted = await extractArticleContent(originalLink);
  const extractionUsable = extracted.success && Boolean(extracted.content);
  if (!extractionUsable) {
    // Extraction failed: hand back what the feed gave us.
    return feedValues;
  }

  return {
    content: extracted.content,
    description: extracted.description || originalDescription,
  };
}