Update content extractor
This commit is contained in:
@ -11,6 +11,122 @@ export interface ExtractedContent {
|
|||||||
// Singleton browser instance for reuse
|
// Singleton browser instance for reuse
|
||||||
let sharedBrowser: Browser | null = null;
|
let sharedBrowser: Browser | null = null;
|
||||||
|
|
||||||
|
// Dynamic content handling function
|
||||||
|
/**
 * Best-effort wait for client-side rendered content before extraction.
 *
 * Once `<body>` is present, four heuristics are started together, each capped
 * by its own time budget:
 *   1. wait for a common loading indicator to appear and then disappear,
 *   2. scroll down in steps to trigger lazy loading, then scroll back to top,
 *   3. wait for a typical article/main container to exist,
 *   4. click the first "read more"/expand control that is found.
 * A heuristic that exceeds its budget is no longer awaited (it is not
 * cancelled). If the handling itself fails, the function logs the error and
 * falls back to a plain delay; it does not reject.
 *
 * Delays use a local `setTimeout` sleep instead of `page.waitForTimeout`,
 * which is deprecated/removed in recent Puppeteer releases and would
 * otherwise throw from inside the fallback path as well.
 *
 * @param page - Page handle; only `waitForSelector`, `evaluate` and `$` are
 *   used. Kept as `any` so existing callers are unaffected.
 * @param options - Optional delay overrides in milliseconds. `settleMs` is the
 *   pause after the heuristics finish (default 2000); `fallbackMs` is the
 *   pause used when the handling fails (default 3000).
 */
async function handleDynamicContent(
  page: any,
  options: { settleMs?: number; fallbackMs?: number } = {},
): Promise<void> {
  const { settleMs = 2000, fallbackMs = 3000 } = options;

  // Pause after clicking an expand control so the revealed content can render.
  const postClickWaitMs = 2000;

  const sleep = (ms: number): Promise<void> =>
    new Promise<void>((resolve) => setTimeout(resolve, ms));

  // Strategy 1: wait for a common loading indicator to appear, then vanish.
  const waitForLoadersToClear = async (): Promise<void> => {
    const loadingSelectors = [
      '.loading', '.loader', '.spinner', '.skeleton',
      '[class*="loading"]', '[class*="skeleton"]',
      '.placeholder', '.shimmer',
    ];

    for (const selector of loadingSelectors) {
      try {
        await page.waitForSelector(selector, { timeout: 2000 });
        await page.waitForSelector(selector, { hidden: true, timeout: 10000 });
        break;
      } catch {
        // Indicator absent or never hidden; try the next selector.
      }
    }
  };

  // Strategy 2: auto-scroll to trigger lazy loading.
  const autoScroll = async (): Promise<void> => {
    await page.evaluate(() => {
      return new Promise<void>((resolve) => {
        let totalHeight = 0;
        const distance = 500;
        const timer = setInterval(() => {
          const scrollHeight = document.body.scrollHeight;
          window.scrollBy(0, distance);
          totalHeight += distance;

          // Stop at the bottom of the page or after 5000px, whichever is first.
          if (totalHeight >= scrollHeight || totalHeight > 5000) {
            clearInterval(timer);
            window.scrollTo(0, 0); // Scroll back to top
            setTimeout(() => resolve(), 1000);
          }
        }, 200);
      });
    });
  };

  // Strategy 3: wait for a content-specific container to exist.
  const waitForMainContent = async (): Promise<void> => {
    const contentSelectors = [
      'article', '.article-content', '.post-content', '.entry-content',
      'main', '[role="main"]', '.main-content',
    ];

    for (const selector of contentSelectors) {
      try {
        await page.waitForSelector(selector, { timeout: 3000 });
        break;
      } catch {
        // Not present within the timeout; try the next selector.
      }
    }
  };

  // Strategy 4: click the first "Read More" / expansion control found.
  const expandCollapsedContent = async (): Promise<void> => {
    const expandButtons = [
      'button[class*="read-more"]', 'button[class*="expand"]',
      '.read-more', '.show-more', '.expand-content',
      'a[class*="read-more"]', 'a[class*="continue"]',
    ];

    for (const selector of expandButtons) {
      try {
        const button = await page.$(selector);
        if (button) {
          await button.click();
          await sleep(postClickWaitMs);
          break;
        }
      } catch {
        // Lookup or click failed; try the next control.
      }
    }
  };

  // Resolves when the strategy settles or the budget elapses, whichever comes
  // first. The timer is always cleared so it does not outlive the race.
  const runWithTimeout = async (
    strategy: () => Promise<void>,
    timeoutMs: number,
  ): Promise<void> => {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const budget = new Promise<void>((resolve) => {
      timer = setTimeout(resolve, timeoutMs);
    });
    try {
      await Promise.race([strategy(), budget]);
    } finally {
      clearTimeout(timer);
    }
  };

  try {
    console.log('Starting dynamic content handling...');

    // Wait for initial content
    await page.waitForSelector('body', { timeout: 5000 });

    const plan: Array<[() => Promise<void>, number]> = [
      [waitForLoadersToClear, 3000],
      [autoScroll, 8000],
      [waitForMainContent, 5000],
      [expandCollapsedContent, 3000],
    ];

    // Start all strategies together; allSettled keeps one failure from
    // aborting the others.
    await Promise.allSettled(
      plan.map(([strategy, timeoutMs]) => runWithTimeout(strategy, timeoutMs)),
    );

    // Final wait for any remaining dynamic content
    await sleep(settleMs);
  } catch (error) {
    console.log('Dynamic content handling failed, using basic timeout:', error);
    // If dynamic content handling fails, continue with basic timeout
    await sleep(fallbackMs);
  }

  console.log('Dynamic content handling completed.');
}
|
||||||
|
|
||||||
async function getBrowser(): Promise<Browser> {
|
async function getBrowser(): Promise<Browser> {
|
||||||
if (!sharedBrowser || !sharedBrowser.isConnected()) {
|
if (!sharedBrowser || !sharedBrowser.isConnected()) {
|
||||||
sharedBrowser = await puppeteer.launch({
|
sharedBrowser = await puppeteer.launch({
|
||||||
@ -41,6 +157,7 @@ export async function closeBrowser(): Promise<void> {
|
|||||||
export async function extractArticleContent(
|
export async function extractArticleContent(
|
||||||
url: string,
|
url: string,
|
||||||
): Promise<ExtractedContent> {
|
): Promise<ExtractedContent> {
|
||||||
|
console.log(`Starting content extraction for: ${url}`);
|
||||||
let page = null;
|
let page = null;
|
||||||
try {
|
try {
|
||||||
const browser = await getBrowser();
|
const browser = await getBrowser();
|
||||||
@ -52,155 +169,352 @@ export async function extractArticleContent(
|
|||||||
);
|
);
|
||||||
await page.setViewport({ width: 1280, height: 720 });
|
await page.setViewport({ width: 1280, height: 720 });
|
||||||
|
|
||||||
// Set navigation timeout
|
// Set navigation timeout and disable images for faster loading
|
||||||
page.setDefaultNavigationTimeout(30000);
|
page.setDefaultNavigationTimeout(45000);
|
||||||
page.setDefaultTimeout(30000);
|
page.setDefaultTimeout(45000);
|
||||||
|
|
||||||
// Navigate to the page
|
// Block unnecessary resources to speed up loading
|
||||||
|
await page.setRequestInterception(true);
|
||||||
|
page.on('request', (req) => {
|
||||||
|
const resourceType = req.resourceType();
|
||||||
|
if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
|
||||||
|
req.abort();
|
||||||
|
} else {
|
||||||
|
req.continue();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Navigate to the page with better waiting strategy
|
||||||
const response = await page.goto(url, {
|
const response = await page.goto(url, {
|
||||||
waitUntil: "networkidle2",
|
waitUntil: "domcontentloaded",
|
||||||
timeout: 30000,
|
timeout: 45000,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!response || !response.ok()) {
|
if (!response || !response.ok()) {
|
||||||
throw new Error(`HTTP ${response?.status()}: Failed to load page`);
|
throw new Error(`HTTP ${response?.status()}: Failed to load page`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for potential dynamic content
|
// Enhanced dynamic content handling
|
||||||
await new Promise((resolve) => setTimeout(resolve, 2000));
|
console.log('Handling dynamic content...');
|
||||||
|
await handleDynamicContent(page);
|
||||||
|
|
||||||
// Extract content using page.evaluate
|
// Extract content using advanced multi-strategy approach
|
||||||
|
console.log('Extracting content using multi-strategy approach...');
|
||||||
const extractedData = await page.evaluate(() => {
|
const extractedData = await page.evaluate(() => {
|
||||||
// Remove unwanted elements
|
// One DOM element considered as the main article body, with its ranking data.
interface ContentCandidate {
|
||||||
|
// The matched DOM element.
element: Element;
|
||||||
|
// Heuristic quality score from calculateContentScore; higher ranks first.
score: number;
|
||||||
|
// The element's text after cleanText normalization.
content: string;
|
||||||
|
// Selector that matched, suffixed with the match index ("selector[index]"); used in log output.
selector: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove unwanted elements first
|
||||||
const unwantedSelectors = [
|
const unwantedSelectors = [
|
||||||
"script",
|
"script", "style", "noscript", "iframe", "embed", "object",
|
||||||
"style",
|
"nav", "header", "footer", "aside", "form",
|
||||||
"nav",
|
".advertisement", ".ads", ".ad", ".adsbygoogle", "[class*='ad-']", "[id*='ad-']",
|
||||||
"header",
|
".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb",
|
||||||
"footer",
|
".social-share", ".share", ".social", ".sns",
|
||||||
"aside",
|
".comments", ".comment", ".disqus",
|
||||||
".advertisement",
|
".cookie-banner", ".cookie", ".gdpr",
|
||||||
".ads",
|
".popup", ".modal", ".overlay", ".lightbox",
|
||||||
".sidebar",
|
".related", ".recommended", ".more-stories",
|
||||||
".menu",
|
".tags", ".categories", ".metadata",
|
||||||
".navigation",
|
".author-bio", ".author-info",
|
||||||
".social-share",
|
".newsletter", ".subscribe", ".signup",
|
||||||
".comments",
|
"[role='complementary']", "[role='banner']", "[role='contentinfo']",
|
||||||
".cookie-banner",
|
"[aria-label*='advertisement']", "[aria-label*='sidebar']"
|
||||||
".popup",
|
|
||||||
".modal",
|
|
||||||
];
|
];
|
||||||
|
|
||||||
unwantedSelectors.forEach((selector) => {
|
unwantedSelectors.forEach((selector) => {
|
||||||
const elements = document.querySelectorAll(selector);
|
try {
|
||||||
elements.forEach((el) => el.remove());
|
const elements = document.querySelectorAll(selector);
|
||||||
|
elements.forEach((el) => el.remove());
|
||||||
|
} catch (e) {
|
||||||
|
// Ignore invalid selectors
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let content = "";
|
|
||||||
let title = "";
|
|
||||||
let description = "";
|
|
||||||
|
|
||||||
// Extract title
|
// Extract title
|
||||||
const titleElement = document.querySelector("title");
|
let title = "";
|
||||||
const h1Element = document.querySelector("h1");
|
const titleSources = [
|
||||||
const ogTitleMeta = document.querySelector('meta[property="og:title"]');
|
() => document.querySelector('meta[property="og:title"]')?.getAttribute('content'),
|
||||||
|
() => document.querySelector('meta[name="twitter:title"]')?.getAttribute('content'),
|
||||||
title =
|
() => document.querySelector('h1')?.textContent?.trim(),
|
||||||
titleElement?.textContent?.trim() ||
|
() => document.querySelector('.article-title, .post-title, .entry-title')?.textContent?.trim(),
|
||||||
h1Element?.textContent?.trim() ||
|
() => document.querySelector('title')?.textContent?.trim(),
|
||||||
ogTitleMeta?.getAttribute("content") ||
|
() => document.querySelector('[itemprop="headline"]')?.textContent?.trim()
|
||||||
"";
|
|
||||||
|
|
||||||
// Extract description
|
|
||||||
const descriptionMeta = document.querySelector(
|
|
||||||
'meta[name="description"]',
|
|
||||||
);
|
|
||||||
const ogDescriptionMeta = document.querySelector(
|
|
||||||
'meta[property="og:description"]',
|
|
||||||
);
|
|
||||||
|
|
||||||
description =
|
|
||||||
descriptionMeta?.getAttribute("content") ||
|
|
||||||
ogDescriptionMeta?.getAttribute("content") ||
|
|
||||||
"";
|
|
||||||
|
|
||||||
// Try multiple content extraction strategies
|
|
||||||
const contentSelectors = [
|
|
||||||
// Common article selectors
|
|
||||||
"article",
|
|
||||||
'[role="main"]',
|
|
||||||
".article-content",
|
|
||||||
".post-content",
|
|
||||||
".entry-content",
|
|
||||||
".content",
|
|
||||||
".main-content",
|
|
||||||
".article-body",
|
|
||||||
".post-body",
|
|
||||||
".story-body",
|
|
||||||
".news-content",
|
|
||||||
|
|
||||||
// Japanese news site specific selectors
|
|
||||||
".article",
|
|
||||||
".news-article",
|
|
||||||
".post",
|
|
||||||
".entry",
|
|
||||||
"#content",
|
|
||||||
"#main",
|
|
||||||
".main",
|
|
||||||
|
|
||||||
// Fallback to common containers
|
|
||||||
".container",
|
|
||||||
"#container",
|
|
||||||
"main",
|
|
||||||
"body",
|
|
||||||
];
|
];
|
||||||
|
|
||||||
for (const selector of contentSelectors) {
|
for (const source of titleSources) {
|
||||||
const element = document.querySelector(selector);
|
try {
|
||||||
if (element) {
|
const result = source();
|
||||||
// Get text content and clean it up
|
if (result && result.length > 0) {
|
||||||
let extractedText = element.textContent?.trim() || "";
|
title = result;
|
||||||
|
|
||||||
// Remove extra whitespace and normalize
|
|
||||||
extractedText = extractedText
|
|
||||||
.replace(/\s+/g, " ")
|
|
||||||
.replace(/\n\s*\n/g, "\n")
|
|
||||||
.trim();
|
|
||||||
|
|
||||||
// Only use if we found substantial content
|
|
||||||
if (extractedText.length > 200) {
|
|
||||||
content = extractedText;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
} catch (e) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract description
|
||||||
|
let description = "";
|
||||||
|
const descriptionSources = [
|
||||||
|
() => document.querySelector('meta[property="og:description"]')?.getAttribute('content'),
|
||||||
|
() => document.querySelector('meta[name="description"]')?.getAttribute('content'),
|
||||||
|
() => document.querySelector('meta[name="twitter:description"]')?.getAttribute('content'),
|
||||||
|
() => document.querySelector('[itemprop="description"]')?.textContent?.trim()
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const source of descriptionSources) {
|
||||||
|
try {
|
||||||
|
const result = source();
|
||||||
|
if (result && result.length > 0) {
|
||||||
|
description = result;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Comprehensive content selectors with priorities
|
||||||
|
const contentSelectors = [
|
||||||
|
// Schema.org and structured data
|
||||||
|
'[itemtype*="Article"] [itemprop="articleBody"]',
|
||||||
|
'[itemtype*="NewsArticle"] [itemprop="articleBody"]',
|
||||||
|
'[itemtype*="BlogPosting"] [itemprop="articleBody"]',
|
||||||
|
|
||||||
|
// High-priority semantic selectors
|
||||||
|
'article[role="main"]',
|
||||||
|
'main article',
|
||||||
|
'[role="main"] article',
|
||||||
|
'article',
|
||||||
|
|
||||||
|
// Common CMS and platform selectors
|
||||||
|
'.post-content', '.entry-content', '.article-content', '.content-area',
|
||||||
|
'.article-body', '.post-body', '.entry-body', '.story-body',
|
||||||
|
'.main-content', '.primary-content', '.page-content',
|
||||||
|
'.news-content', '.blog-content', '.editorial-content',
|
||||||
|
|
||||||
|
// WordPress specific
|
||||||
|
'.wp-content', '.entry', '.post',
|
||||||
|
|
||||||
|
// Medium, Substack, Ghost
|
||||||
|
'.section-content', '.postArticle-content', '.post-full-content',
|
||||||
|
'.markup', '.section--body', '.section-divider + .section-content',
|
||||||
|
|
||||||
|
// Japanese sites specific
|
||||||
|
'.honbun', '.main_text', '.article_body', '.news_body',
|
||||||
|
'.entry_text', '.blog_text', '.content_text',
|
||||||
|
'.kiji', '.news', '.article',
|
||||||
|
|
||||||
|
// Generic semantic HTML5
|
||||||
|
'main', '[role="main"]',
|
||||||
|
|
||||||
|
// ID-based selectors
|
||||||
|
'#content', '#main', '#article', '#post', '#entry',
|
||||||
|
'#main-content', '#primary', '#content-area',
|
||||||
|
|
||||||
|
// Class-based common patterns
|
||||||
|
'.content', '.main', '.wrapper', '.container',
|
||||||
|
|
||||||
|
// Fallbacks
|
||||||
|
'body'
|
||||||
|
];
|
||||||
|
|
||||||
|
// Function to calculate content quality score
|
||||||
|
/**
 * Heuristically rates how likely `element` is to be the main article body.
 *
 * The score rewards text volume (capped), long and numerous paragraphs, low
 * link density, article-like markup (`<article>`, `role="main"`, headings,
 * many block-level text elements) and penalizes image-heavy or
 * navigation-named containers.
 *
 * @param element - Candidate DOM element.
 * @returns A non-negative score; 0 when the element is missing or holds fewer
 *   than 100 characters of text.
 */
function calculateContentScore(element: Element): number {
  if (!element) return 0;

  const text = element.textContent || '';
  if (text.length < 100) return 0;

  let score = 0;

  // Base score from text length (diminishing returns): capped at 50 points.
  score += Math.min(text.length / 100, 50);

  // Paragraph density
  const paragraphs = Array.from(element.querySelectorAll('p'));
  const paragraphChars = paragraphs.reduce(
    (sum, p) => sum + (p.textContent?.length || 0),
    0,
  );
  const avgParagraphLength =
    paragraphs.length > 0 ? paragraphChars / paragraphs.length : 0;

  if (avgParagraphLength > 100) score += 20;
  if (paragraphs.length > 3) score += 10;

  // Link density penalty (articles shouldn't be mostly links)
  const linkChars = Array.from(element.querySelectorAll('a')).reduce(
    (sum, link) => sum + (link.textContent?.length || 0),
    0,
  );
  const linkDensity = text.length > 0 ? linkChars / text.length : 0;
  if (linkDensity < 0.2) {
    score += 15;
  } else if (linkDensity < 0.4) {
    score += 5;
  } else {
    score -= 10;
  }

  // Bonus for article-like structure
  if (element.tagName === 'ARTICLE') score += 25;
  if (element.getAttribute('role') === 'main') score += 20;
  if (element.querySelector('h1, h2, h3')) score += 10;

  // Bonus for semantic elements
  const semanticElements = element.querySelectorAll(
    'p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol',
  );
  if (semanticElements.length > 5) score += 15;

  // Penalty for too many images without text (more than one per 500 chars)
  const images = element.querySelectorAll('img');
  if (images.length > text.length / 500) score -= 5;

  // Penalty for navigation-like content.
  // The class is read through getAttribute because `className` is an
  // SVGAnimatedString (not a string) on SVG elements, where calling
  // `.toLowerCase()` on it would throw.
  const navWords = ['メニュー', 'ナビ', 'カテゴリ', 'タグ', 'menu', 'navigation', 'nav', 'sidebar'];
  const classNames = (element.getAttribute('class') ?? '').toLowerCase();
  const id = (element.id ?? '').toLowerCase();
  if (navWords.some((word) => classNames.includes(word) || id.includes(word))) {
    score -= 20;
  }

  return Math.max(score, 0);
}
|
||||||
|
|
||||||
|
// Function to clean and normalize text
|
||||||
|
/**
 * Normalizes extracted text to a single, tidy line.
 *
 * Zero-width characters (U+200B–U+200D, U+FEFF) are removed first, then every
 * run of whitespace (including line breaks) is collapsed to one space and the
 * result is trimmed. Stripping the zero-width characters before collapsing
 * matters: doing it afterwards leaves a doubled space where such a character
 * sat between two spaces.
 *
 * Note: U+FEFF is also matched by `\s`, so removing it up front means it no
 * longer turns into a space between two words.
 *
 * @param text - Raw text, e.g. an element's `textContent`.
 * @returns The normalized text; an empty string for blank input.
 */
function cleanText(text: string): string {
  return text
    .replace(/[\u200B-\u200D\uFEFF]/g, '') // Remove zero-width characters
    .replace(/\s+/g, ' ') // Normalize whitespace (also flattens line breaks)
    .trim();
}
|
||||||
|
|
||||||
|
// Collect and score all content candidates
|
||||||
|
const candidates: ContentCandidate[] = [];
|
||||||
|
|
||||||
|
for (const selector of contentSelectors) {
|
||||||
|
try {
|
||||||
|
const elements = document.querySelectorAll(selector);
|
||||||
|
elements.forEach((element, index) => {
|
||||||
|
const text = element.textContent || '';
|
||||||
|
if (text.length > 200) { // Minimum content threshold
|
||||||
|
const score = calculateContentScore(element);
|
||||||
|
candidates.push({
|
||||||
|
element,
|
||||||
|
score,
|
||||||
|
content: cleanText(text),
|
||||||
|
selector: `${selector}[${index}]`
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (e) {
|
||||||
|
// Skip invalid selectors
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort candidates by score (highest first)
|
||||||
|
candidates.sort((a, b) => b.score - a.score);
|
||||||
|
|
||||||
|
console.log(`Found ${candidates.length} content candidates`);
|
||||||
|
if (candidates.length > 0) {
|
||||||
|
console.log(`Best candidate score: ${candidates[0].score}, selector: ${candidates[0].selector}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the best content
|
||||||
|
let content = "";
|
||||||
|
if (candidates.length > 0) {
|
||||||
|
content = candidates[0].content;
|
||||||
|
|
||||||
|
// If the best candidate is still short, try combining top candidates
|
||||||
|
if (content.length < 500 && candidates.length > 1) {
|
||||||
|
const topCandidates = candidates.slice(0, 3).filter(c => c.score > 10);
|
||||||
|
const combinedContent = topCandidates.map(c => c.content).join('\n\n');
|
||||||
|
if (combinedContent.length > content.length) {
|
||||||
|
content = cleanText(combinedContent);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If still no content, try paragraph extraction
|
// Fallback strategies if still no good content
|
||||||
if (!content) {
|
if (!content || content.length < 200) {
|
||||||
const paragraphs = Array.from(document.querySelectorAll("p"))
|
// Try paragraph aggregation
|
||||||
.map((p) => p.textContent?.trim() || "")
|
const paragraphs = Array.from(document.querySelectorAll('p'))
|
||||||
.filter((p) => p.length > 50); // Filter out short paragraphs
|
.map(p => p.textContent?.trim() || '')
|
||||||
content = paragraphs.join("\n\n");
|
.filter(p => p.length > 50)
|
||||||
|
.filter(p => {
|
||||||
|
// Filter out likely navigation/boilerplate paragraphs
|
||||||
|
const lowerP = p.toLowerCase();
|
||||||
|
return !lowerP.includes('cookie') &&
|
||||||
|
!lowerP.includes('privacy') &&
|
||||||
|
!lowerP.includes('terms of service') &&
|
||||||
|
!lowerP.includes('subscribe') &&
|
||||||
|
!lowerP.includes('newsletter');
|
||||||
|
});
|
||||||
|
|
||||||
|
if (paragraphs.length > 0) {
|
||||||
|
content = cleanText(paragraphs.join('\n\n'));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Final fallback: use body text
|
// Final fallback: structured data
|
||||||
if (!content || content.length < 100) {
|
if (!content || content.length < 200) {
|
||||||
const bodyText = document.body?.textContent || "";
|
try {
|
||||||
content = bodyText.replace(/\s+/g, " ").trim();
|
const jsonLd = document.querySelector('script[type="application/ld+json"]');
|
||||||
|
if (jsonLd) {
|
||||||
|
const data = JSON.parse(jsonLd.textContent || '{}');
|
||||||
|
if (data.articleBody) {
|
||||||
|
content = cleanText(data.articleBody);
|
||||||
|
} else if (data.text) {
|
||||||
|
content = cleanText(data.text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Ignore JSON parsing errors
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.log(`Final content length: ${content.length} characters`);
|
||||||
return { title, content, description };
|
return { title, content, description };
|
||||||
});
|
});
|
||||||
|
|
||||||
// Validate extracted content
|
// Validate extracted content with more lenient threshold
|
||||||
if (!extractedData.content || extractedData.content.length < 50) {
|
if (!extractedData.content || extractedData.content.length < 100) {
|
||||||
return {
|
// Try one more extraction attempt with relaxed criteria
|
||||||
title: extractedData.title,
|
const fallbackData = await page.evaluate(() => {
|
||||||
content: "",
|
// Last resort: extract all text from body, excluding common noise
|
||||||
description: extractedData.description,
|
const body = document.body;
|
||||||
success: false,
|
if (body) {
|
||||||
error: "Insufficient content extracted",
|
// Clone body to avoid modifying original
|
||||||
};
|
const bodyClone = body.cloneNode(true) as Element;
|
||||||
|
|
||||||
|
// Remove noise elements from clone
|
||||||
|
const noiseSelectors = [
|
||||||
|
'script', 'style', 'nav', 'header', 'footer', 'aside',
|
||||||
|
'.ad', '.ads', '.advertisement', '[class*="ad-"]',
|
||||||
|
'.menu', '.navigation', '.sidebar', '.social',
|
||||||
|
'.cookie', '.popup', '.modal'
|
||||||
|
];
|
||||||
|
|
||||||
|
noiseSelectors.forEach(selector => {
|
||||||
|
const elements = bodyClone.querySelectorAll(selector);
|
||||||
|
elements.forEach(el => el.remove());
|
||||||
|
});
|
||||||
|
|
||||||
|
const text = bodyClone.textContent || '';
|
||||||
|
return text.replace(/\s+/g, ' ').trim();
|
||||||
|
}
|
||||||
|
return '';
|
||||||
|
});
|
||||||
|
|
||||||
|
if (fallbackData && fallbackData.length > 200) {
|
||||||
|
extractedData.content = fallbackData;
|
||||||
|
} else {
|
||||||
|
return {
|
||||||
|
title: extractedData.title,
|
||||||
|
content: extractedData.content || "",
|
||||||
|
description: extractedData.description,
|
||||||
|
success: false,
|
||||||
|
error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Limit content length to avoid token limits
|
// Limit content length to avoid token limits
|
||||||
@ -210,6 +524,7 @@ export async function extractArticleContent(
|
|||||||
content = content.substring(0, maxLength) + "...";
|
content = content.substring(0, maxLength) + "...";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.log(`Successfully extracted content: ${content.length} characters`);
|
||||||
return {
|
return {
|
||||||
title: extractedData.title,
|
title: extractedData.title,
|
||||||
content,
|
content,
|
||||||
@ -217,6 +532,7 @@ export async function extractArticleContent(
|
|||||||
success: true,
|
success: true,
|
||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
console.error(`Content extraction failed for ${url}:`, error);
|
||||||
return {
|
return {
|
||||||
title: "",
|
title: "",
|
||||||
content: "",
|
content: "",
|
||||||
|
Reference in New Issue
Block a user