Files
VoiceRSSSummary/services/content-extractor.ts

263 lines
6.9 KiB
TypeScript

import puppeteer, { type Browser } from "puppeteer";
/**
 * Outcome of a single article-extraction attempt.
 *
 * Failures are reported through `success` / `error` rather than by throwing,
 * so callers can always inspect the returned object.
 */
export interface ExtractedContent {
  /** True when enough article text was extracted to be usable. */
  success: boolean;
  /** Extracted article text; an empty string when `success` is false. */
  content: string;
  /** Page title taken from `<title>`, the first `<h1>`, or `og:title`. */
  title?: string;
  /** Summary taken from the `description` or `og:description` meta tag. */
  description?: string;
  /** Human-readable failure reason; only set when `success` is false. */
  error?: string;
}
// Singleton browser instance for reuse
let sharedBrowser: Browser | null = null;
// Launch currently in progress, if any. Overlapping callers await this same
// promise instead of each starting their own browser process.
let pendingBrowserLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared Puppeteer browser, launching one when none exists or the
 * previous instance is no longer connected.
 *
 * Calls that overlap while a launch is still pending share that single launch.
 * Without this, every overlapping call would launch its own browser and only
 * the last one would remain referenced, leaving the others impossible to close.
 *
 * @returns The connected shared browser instance.
 * @throws Whatever `puppeteer.launch` rejects with. A failed launch is not
 *   cached, so the next call tries again.
 */
async function getBrowser(): Promise<Browser> {
  if (sharedBrowser && sharedBrowser.isConnected()) {
    return sharedBrowser;
  }

  if (!pendingBrowserLaunch) {
    pendingBrowserLaunch = puppeteer.launch({
      headless: true,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-accelerated-2d-canvas",
        "--no-first-run",
        "--no-zygote",
        "--disable-gpu",
        // NOTE(review): this flag turns off the same-origin policy for every
        // page loaded in this browser; confirm it is really needed for
        // text extraction before keeping it.
        "--disable-web-security",
        "--disable-features=VizDisplayCompositor",
      ],
    });
  }

  const launch = pendingBrowserLaunch;
  try {
    const browser = await launch;
    sharedBrowser = browser;
    return browser;
  } finally {
    // Clear the marker once this launch has settled (success or failure), but
    // never clobber a newer launch started in the meantime.
    if (pendingBrowserLaunch === launch) {
      pendingBrowserLaunch = null;
    }
  }
}
/**
 * Closes the shared browser, if there is one, and forgets the reference.
 *
 * The reference is dropped before awaiting the close so that a `getBrowser()`
 * call arriving while shutdown is in progress launches a fresh browser instead
 * of receiving one that is about to go away. A browser that has already
 * disconnected is simply forgotten.
 *
 * @throws Whatever `browser.close()` rejects with; the shared reference is
 *   cleared even in that case.
 */
export async function closeBrowser(): Promise<void> {
  const browser = sharedBrowser;
  if (!browser) {
    return;
  }

  sharedBrowser = null;

  if (browser.isConnected()) {
    await browser.close();
  }
}
/**
 * Shortens `text` to at most `maxLength` UTF-16 code units and appends "..."
 * when something was cut off. The cut point is moved back by one unit if it
 * would otherwise fall between the two halves of a surrogate pair, so the
 * result never ends in a lone high surrogate.
 */
function truncateContent(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  let end = maxLength;
  const lastUnit = text.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return text.substring(0, end) + "...";
}

/**
 * Loads `url` in the shared headless browser and extracts the article's
 * title, description and main text.
 *
 * Failures (invalid or non-http(s) URL, navigation error, non-OK HTTP status,
 * too little text) are reported through `success: false` and `error` instead
 * of a rejected promise.
 *
 * @param url Absolute http(s) URL of the article page.
 * @returns The extracted data; `content` is capped at 5000 characters plus a
 *   trailing "..." when truncated.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  const timeoutMs = 30000;
  const settleDelayMs = 2000;
  const maxLength = 5000;

  const failure = (message: string): ExtractedContent => ({
    title: "",
    content: "",
    description: "",
    success: false,
    error: message,
  });

  // Only navigate to web URLs. The link is externally supplied, and schemes
  // such as file: would make the browser read local resources.
  let protocol: string;
  try {
    protocol = new URL(url).protocol;
  } catch {
    return failure(`Invalid URL: ${url}`);
  }
  if (protocol !== "http:" && protocol !== "https:") {
    return failure(`Unsupported URL protocol: ${protocol}`);
  }

  try {
    const browser = await getBrowser();
    const page = await browser.newPage();
    try {
      // Set user agent and viewport
      await page.setUserAgent(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
      );
      await page.setViewport({ width: 1280, height: 720 });

      // Set navigation timeout
      page.setDefaultNavigationTimeout(timeoutMs);
      page.setDefaultTimeout(timeoutMs);

      // Navigate to the page
      const response = await page.goto(url, {
        waitUntil: "networkidle2",
        timeout: timeoutMs,
      });
      if (!response) {
        throw new Error("No response received: Failed to load page");
      }
      if (!response.ok()) {
        throw new Error(`HTTP ${response.status()}: Failed to load page`);
      }

      // Wait for potential dynamic content
      await new Promise((resolve) => setTimeout(resolve, settleDelayMs));

      // Everything inside this callback runs in the page, not in Node, so it
      // must be self-contained and cannot reference anything from this module.
      const extractedData = await page.evaluate(() => {
        // Remove unwanted elements
        const unwantedSelectors = [
          "script",
          "style",
          "nav",
          "header",
          "footer",
          "aside",
          ".advertisement",
          ".ads",
          ".sidebar",
          ".menu",
          ".navigation",
          ".social-share",
          ".comments",
          ".cookie-banner",
          ".popup",
          ".modal",
        ];
        unwantedSelectors.forEach((selector) => {
          document.querySelectorAll(selector).forEach((el) => el.remove());
        });

        let content = "";

        // Extract title (`||` on purpose: an empty string falls through to
        // the next candidate)
        const titleElement = document.querySelector("title");
        const h1Element = document.querySelector("h1");
        const ogTitleMeta = document.querySelector('meta[property="og:title"]');
        const title =
          titleElement?.textContent?.trim() ||
          h1Element?.textContent?.trim() ||
          ogTitleMeta?.getAttribute("content") ||
          "";

        // Extract description
        const descriptionMeta = document.querySelector(
          'meta[name="description"]',
        );
        const ogDescriptionMeta = document.querySelector(
          'meta[property="og:description"]',
        );
        const description =
          descriptionMeta?.getAttribute("content") ||
          ogDescriptionMeta?.getAttribute("content") ||
          "";

        // Try multiple content extraction strategies, most specific first
        const contentSelectors = [
          // Common article selectors
          "article",
          '[role="main"]',
          ".article-content",
          ".post-content",
          ".entry-content",
          ".content",
          ".main-content",
          ".article-body",
          ".post-body",
          ".story-body",
          ".news-content",
          // Japanese news site specific selectors
          ".article",
          ".news-article",
          ".post",
          ".entry",
          "#content",
          "#main",
          ".main",
          // Fallback to common containers
          ".container",
          "#container",
          "main",
          "body",
        ];
        for (const selector of contentSelectors) {
          const element = document.querySelector(selector);
          if (!element) {
            continue;
          }
          // Collapse every run of whitespace (newlines included) to one space
          const extractedText = (element.textContent?.trim() || "")
            .replace(/\s+/g, " ")
            .trim();
          // Only use if we found substantial content
          if (extractedText.length > 200) {
            content = extractedText;
            break;
          }
        }

        // If still no content, try paragraph extraction
        if (!content) {
          content = Array.from(document.querySelectorAll("p"))
            .map((p) => p.textContent?.trim() || "")
            .filter((p) => p.length > 50) // Filter out short paragraphs
            .join("\n\n");
        }

        // Final fallback: use body text
        if (!content || content.length < 100) {
          const bodyText = document.body?.textContent || "";
          content = bodyText.replace(/\s+/g, " ").trim();
        }

        return { title, content, description };
      });

      // Validate extracted content
      if (!extractedData.content || extractedData.content.length < 50) {
        return {
          title: extractedData.title,
          content: "",
          description: extractedData.description,
          success: false,
          error: "Insufficient content extracted",
        };
      }

      // Limit content length to avoid token limits
      return {
        title: extractedData.title,
        content: truncateContent(extractedData.content, maxLength),
        description: extractedData.description,
        success: true,
      };
    } finally {
      try {
        await page.close();
      } catch {
        // Deliberate best effort: the page may already be gone (e.g. the
        // browser disconnected). A cleanup failure must not replace the
        // result or turn it into a rejected promise.
      }
    }
  } catch (error) {
    return failure(
      error instanceof Error ? error.message : "Unknown error occurred",
    );
  }
}
/**
 * Supplies article text for a feed item, scraping the linked page only when
 * the feed itself did not provide enough.
 *
 * @param originalTitle Title from the feed (not used by this function).
 * @param originalLink URL of the article, passed to `extractArticleContent`.
 * @param originalContent Content supplied by the feed, if any.
 * @param originalDescription Description supplied by the feed, if any.
 * @returns The feed's own values when they exceed 500 characters or when
 *   scraping fails; otherwise the scraped content, with the scraped
 *   description falling back to the feed's description.
 */
export async function enhanceArticleContent(
  originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  const feedValues = {
    content: originalContent,
    description: originalDescription,
  };

  // Feed text that is already substantial makes a page load unnecessary.
  const feedText = originalContent || originalDescription || "";
  if (feedText.length > 500) {
    return feedValues;
  }

  // Otherwise scrape the page; keep the feed's values if that yields nothing.
  const scraped = await extractArticleContent(originalLink);
  if (!scraped.success || !scraped.content) {
    return feedValues;
  }

  return {
    content: scraped.content,
    description: scraped.description || originalDescription,
  };
}