// TypeScript · 263 lines · 6.9 KiB
import puppeteer, { type Browser } from "puppeteer";
|
|
|
|
/**
 * Outcome of one attempt to pull readable article text out of a web page.
 *
 * `success` tells callers whether `content` is usable; when it is `false`,
 * `content` is an empty string and `error` carries the reason.
 */
export interface ExtractedContent {
  /** Title found on the page, if any. */
  title?: string;
  /** Extracted article text; empty when extraction failed. */
  content: string;
  /** Description taken from the page's meta tags, if any. */
  description?: string;
  /** `true` when enough content was extracted to be usable. */
  success: boolean;
  /** Human-readable failure reason; only set when `success` is `false`. */
  error?: string;
}
|
|
|
|
// Single browser instance reused across extractions so Chromium is not
// relaunched for every page. `null` until first launched and after it is closed.
let sharedBrowser: Browser | null = null;
|
|
|
|
// Launch currently in flight, if any. Sharing it keeps concurrent callers from
// each starting (and all but one of them orphaning) their own Chromium process.
let pendingBrowserLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared browser, launching it first when there is none or the
 * previous one has disconnected.
 *
 * Concurrent calls made while a launch is in progress all await that same
 * launch instead of starting additional browsers.
 *
 * @returns The connected shared browser instance.
 * @throws Whatever `puppeteer.launch` rejects with; the failed launch is not
 *   cached, so the next call retries.
 */
async function getBrowser(): Promise<Browser> {
  if (sharedBrowser && sharedBrowser.isConnected()) {
    return sharedBrowser;
  }

  if (!pendingBrowserLaunch) {
    pendingBrowserLaunch = puppeteer.launch({
      headless: true,
      args: [
        // NOTE(review): running without the sandbox is commonly required in
        // containers but weakens isolation for untrusted pages — confirm it
        // is needed in this deployment.
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-accelerated-2d-canvas",
        "--no-first-run",
        "--no-zygote",
        "--disable-gpu",
        // NOTE(review): this turns off the same-origin policy for every page
        // loaded in this browser. Text extraction should not need it —
        // confirm and consider removing.
        "--disable-web-security",
        "--disable-features=VizDisplayCompositor",
      ],
    });
  }

  const launch = pendingBrowserLaunch;
  try {
    const browser = await launch;
    sharedBrowser = browser;
    return browser;
  } finally {
    // Clear only our own launch so a newer in-flight launch is never dropped.
    if (pendingBrowserLaunch === launch) {
      pendingBrowserLaunch = null;
    }
  }
}
|
|
|
|
/**
 * Closes the shared browser, if one exists, and forgets it so the next
 * extraction launches a fresh instance.
 *
 * @throws Whatever `Browser.close()` rejects with. The shared reference is
 *   cleared regardless, so a failed close never leaves a dead browser cached.
 */
export async function closeBrowser(): Promise<void> {
  const browser = sharedBrowser;

  // Detach before awaiting: if a new browser is installed while close() is in
  // flight, it must not be overwritten with null (and thereby orphaned) once
  // the close completes.
  sharedBrowser = null;

  if (browser && browser.isConnected()) {
    await browser.close();
  }
}
|
|
|
|
/**
 * Loads `url` in a page of the shared headless browser and extracts the
 * article's title, meta description and main text.
 *
 * Extraction strategy (run inside the page):
 *  1. strip boilerplate elements (scripts, navigation, ads, …);
 *  2. take the first candidate container whose whitespace-collapsed text is
 *     longer than 200 characters;
 *  3. otherwise join all paragraphs longer than 50 characters;
 *  4. otherwise fall back to the whole body text.
 *
 * The returned text is capped at 5000 characters (plus a trailing "...").
 *
 * This function never rejects: every failure — bad URL, navigation error,
 * non-2xx status, too little text — is reported as `success: false` with an
 * `error` message.
 *
 * @param url Absolute `http:` or `https:` URL of the article.
 * @returns The extracted content, or a failure result describing what went wrong.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  const failure = (message: string): ExtractedContent => ({
    title: "",
    content: "",
    description: "",
    success: false,
    error: message,
  });

  // Fail fast, before paying for a browser page, on anything that is not a
  // plain web URL. Links come from external feeds, and schemes such as
  // `file:` have no business being opened by the scraper.
  let protocol: string;
  try {
    protocol = new URL(url).protocol;
  } catch {
    return failure(`Invalid URL: ${url}`);
  }
  if (protocol !== "http:" && protocol !== "https:") {
    return failure(`Unsupported URL protocol: ${protocol}`);
  }

  try {
    const browser = await getBrowser();
    const page = await browser.newPage();

    try {
      await page.setUserAgent(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
      );
      await page.setViewport({ width: 1280, height: 720 });

      page.setDefaultNavigationTimeout(30000);
      page.setDefaultTimeout(30000);

      const response = await page.goto(url, {
        waitUntil: "networkidle2",
        timeout: 30000,
      });

      if (!response) {
        throw new Error("No response received: Failed to load page");
      }
      if (!response.ok()) {
        throw new Error(`HTTP ${response.status()}: Failed to load page`);
      }

      // Give client-side rendering a moment to finish after the network settles.
      await new Promise((resolve) => setTimeout(resolve, 2000));

      // Everything inside this callback executes in the page, not in Node, so
      // it must be self-contained and cannot reference outer variables.
      const extractedData = await page.evaluate(() => {
        // Collapses every whitespace run (including newlines) to one space.
        const collapseWhitespace = (text: string): string =>
          text.replace(/\s+/g, " ").trim();

        // Strip boilerplate first so it cannot leak into the extracted text.
        const unwantedSelectors = [
          "script",
          "style",
          "nav",
          "header",
          "footer",
          "aside",
          ".advertisement",
          ".ads",
          ".sidebar",
          ".menu",
          ".navigation",
          ".social-share",
          ".comments",
          ".cookie-banner",
          ".popup",
          ".modal",
        ];
        for (const selector of unwantedSelectors) {
          document.querySelectorAll(selector).forEach((el) => el.remove());
        }

        // Title: <title>, then the first <h1>, then Open Graph metadata.
        // `||` (not `??`) is deliberate: an empty string should fall through.
        const title =
          document.querySelector("title")?.textContent?.trim() ||
          document.querySelector("h1")?.textContent?.trim() ||
          document
            .querySelector('meta[property="og:title"]')
            ?.getAttribute("content") ||
          "";

        // Description: standard meta tag, then Open Graph metadata.
        const description =
          document
            .querySelector('meta[name="description"]')
            ?.getAttribute("content") ||
          document
            .querySelector('meta[property="og:description"]')
            ?.getAttribute("content") ||
          "";

        // Candidate containers, ordered from most to least specific.
        const contentSelectors = [
          // Common article selectors
          "article",
          '[role="main"]',
          ".article-content",
          ".post-content",
          ".entry-content",
          ".content",
          ".main-content",
          ".article-body",
          ".post-body",
          ".story-body",
          ".news-content",

          // Japanese news site specific selectors
          ".article",
          ".news-article",
          ".post",
          ".entry",
          "#content",
          "#main",
          ".main",

          // Fallback to common containers
          ".container",
          "#container",
          "main",
          "body",
        ];

        let content = "";
        for (const selector of contentSelectors) {
          const element = document.querySelector(selector);
          if (!element) {
            continue;
          }
          const text = collapseWhitespace(element.textContent ?? "");
          // Only accept a container that holds a substantial amount of text.
          if (text.length > 200) {
            content = text;
            break;
          }
        }

        // No container qualified: stitch together the longer paragraphs.
        if (!content) {
          content = Array.from(document.querySelectorAll("p"))
            .map((p) => p.textContent?.trim() || "")
            .filter((text) => text.length > 50)
            .join("\n\n");
        }

        // Last resort: whatever text the body contains.
        if (content.length < 100) {
          content = collapseWhitespace(document.body?.textContent || "");
        }

        return { title, content, description };
      });

      if (extractedData.content.length < 50) {
        return {
          title: extractedData.title,
          content: "",
          description: extractedData.description,
          success: false,
          error: "Insufficient content extracted",
        };
      }

      // Cap the length so downstream consumers stay within their size limits.
      const maxLength = 5000;
      let content = extractedData.content;
      if (content.length > maxLength) {
        let cut = maxLength;
        // Do not cut between the two halves of a surrogate pair, which would
        // leave a lone, invalid code unit at the end of the text.
        const lastUnit = content.charCodeAt(cut - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
          cut -= 1;
        }
        content = content.substring(0, cut) + "...";
      }

      return {
        title: extractedData.title,
        content,
        description: extractedData.description,
        success: true,
      };
    } finally {
      // An exception thrown from `finally` would replace the result computed
      // above and make this function reject, so a failed close (e.g. the
      // browser already crashed) is deliberately ignored.
      try {
        await page.close();
      } catch {
        // Nothing useful can be done with a page that refuses to close.
      }
    }
  } catch (error) {
    return failure(
      error instanceof Error ? error.message : "Unknown error occurred",
    );
  }
}
|
|
|
|
/**
 * Fills in article text by scraping the article page when the text already
 * at hand is too short.
 *
 * The existing text is `originalContent` or, when that is empty,
 * `originalDescription`. If it is longer than 500 characters it is considered
 * sufficient and returned untouched; otherwise the page at `originalLink` is
 * fetched and, on success, its extracted content replaces the original.
 *
 * @param _originalTitle Article title; currently unused.
 * @param originalLink URL of the full article.
 * @param originalContent Content already available for the article, if any.
 * @param originalDescription Description already available, if any.
 * @returns The best available content and description. When extraction is
 *   skipped or fails, the original values are returned unchanged.
 */
export async function enhanceArticleContent(
  _originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  const original = {
    content: originalContent,
    description: originalDescription,
  };

  // `||` is deliberate: an empty content string falls through to the description.
  const existingText = originalContent || originalDescription || "";
  if (existingText.length > 500) {
    return original;
  }

  const extracted = await extractArticleContent(originalLink);
  if (!extracted.success || !extracted.content) {
    return original;
  }

  return {
    content: extracted.content,
    // Keep the original description when the page did not provide one.
    description: extracted.description || originalDescription,
  };
}
|