Replace cheerio with puppeteer
This commit is contained in:
@ -1,4 +1,4 @@
|
||||
import * as cheerio from "cheerio";
|
||||
import puppeteer, { type Browser } from "puppeteer";
|
||||
|
||||
export interface ExtractedContent {
|
||||
title?: string;
|
||||
@ -8,126 +8,194 @@ export interface ExtractedContent {
|
||||
error?: string;
|
||||
}
|
||||
|
||||
// Singleton browser instance for reuse
|
||||
let sharedBrowser: Browser | null = null;
|
||||
|
||||
// In-flight launch shared by concurrent callers, so that only one browser
// process is started while the first launch is still pending.
let pendingBrowserLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared Puppeteer browser, launching it on first use or when the
 * cached instance is no longer connected.
 *
 * Callers that arrive while a launch is already in progress await that same
 * launch instead of starting a second browser (which would otherwise overwrite
 * `sharedBrowser` and leak the first process).
 *
 * @returns The connected shared browser instance.
 * @throws Whatever `puppeteer.launch` rejects with; the pending launch is
 *   cleared on failure so the next call retries.
 */
async function getBrowser(): Promise<Browser> {
  if (sharedBrowser?.isConnected()) {
    return sharedBrowser;
  }

  if (!pendingBrowserLaunch) {
    pendingBrowserLaunch = puppeteer
      .launch({
        headless: true,
        // NOTE(review): "--disable-web-security" switches off the same-origin
        // policy and is combined here with "--no-sandbox"; confirm both are
        // really required, since the pages loaded come from caller-supplied URLs.
        args: [
          "--no-sandbox",
          "--disable-setuid-sandbox",
          "--disable-dev-shm-usage",
          "--disable-accelerated-2d-canvas",
          "--no-first-run",
          "--no-zygote",
          "--disable-gpu",
          "--disable-web-security",
          "--disable-features=VizDisplayCompositor",
        ],
      })
      .then((browser) => {
        sharedBrowser = browser;
        return browser;
      })
      .finally(() => {
        // Clear on success and on failure: success is served from
        // `sharedBrowser` from now on, failure must not be cached.
        pendingBrowserLaunch = null;
      });
  }

  return pendingBrowserLaunch;
}
|
||||
|
||||
/**
 * Closes the shared Puppeteer browser, if there is one, and forgets it.
 *
 * The cached reference is cleared before the browser is closed, so that:
 *  - a browser that has already disconnected does not linger as a stale
 *    reference,
 *  - the reference is dropped even when `close()` rejects, and
 *  - code that reads `sharedBrowser` while `close()` is still pending does not
 *    pick up the instance that is being shut down.
 *
 * @returns A promise that resolves once the browser (if connected) has closed.
 * @throws Whatever `Browser.close()` rejects with.
 */
export async function closeBrowser(): Promise<void> {
  const browser = sharedBrowser;
  sharedBrowser = null;

  if (browser?.isConnected()) {
    await browser.close();
  }
}
|
||||
|
||||
export async function extractArticleContent(
|
||||
url: string,
|
||||
): Promise<ExtractedContent> {
|
||||
let page = null;
|
||||
try {
|
||||
// Fetch the HTML content
|
||||
const response = await fetch(url, {
|
||||
headers: {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
||||
Accept:
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
|
||||
"Accept-Encoding": "gzip, deflate",
|
||||
Connection: "keep-alive",
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
},
|
||||
signal: AbortSignal.timeout(30000), // 30 second timeout
|
||||
const browser = await getBrowser();
|
||||
page = await browser.newPage();
|
||||
|
||||
// Set user agent and viewport
|
||||
await page.setUserAgent(
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
||||
);
|
||||
await page.setViewport({ width: 1280, height: 720 });
|
||||
|
||||
// Set navigation timeout
|
||||
page.setDefaultNavigationTimeout(30000);
|
||||
page.setDefaultTimeout(30000);
|
||||
|
||||
// Navigate to the page
|
||||
const response = await page.goto(url, {
|
||||
waitUntil: "networkidle2",
|
||||
timeout: 30000,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
if (!response || !response.ok()) {
|
||||
throw new Error(`HTTP ${response?.status()}: Failed to load page`);
|
||||
}
|
||||
|
||||
const html = await response.text();
|
||||
const $ = cheerio.load(html);
|
||||
// Wait for potential dynamic content
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
|
||||
// Remove unwanted elements
|
||||
$(
|
||||
"script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments",
|
||||
).remove();
|
||||
// Extract content using page.evaluate
|
||||
const extractedData = await page.evaluate(() => {
|
||||
// Remove unwanted elements
|
||||
const unwantedSelectors = [
|
||||
"script",
|
||||
"style",
|
||||
"nav",
|
||||
"header",
|
||||
"footer",
|
||||
"aside",
|
||||
".advertisement",
|
||||
".ads",
|
||||
".sidebar",
|
||||
".menu",
|
||||
".navigation",
|
||||
".social-share",
|
||||
".comments",
|
||||
".cookie-banner",
|
||||
".popup",
|
||||
".modal",
|
||||
];
|
||||
|
||||
let content = "";
|
||||
let title = "";
|
||||
let description = "";
|
||||
unwantedSelectors.forEach((selector) => {
|
||||
const elements = document.querySelectorAll(selector);
|
||||
elements.forEach((el) => el.remove());
|
||||
});
|
||||
|
||||
// Extract title
|
||||
title =
|
||||
$("title").text().trim() ||
|
||||
$("h1").first().text().trim() ||
|
||||
$('meta[property="og:title"]').attr("content") ||
|
||||
"";
|
||||
let content = "";
|
||||
let title = "";
|
||||
let description = "";
|
||||
|
||||
// Extract description
|
||||
description =
|
||||
$('meta[name="description"]').attr("content") ||
|
||||
$('meta[property="og:description"]').attr("content") ||
|
||||
"";
|
||||
// Extract title
|
||||
const titleElement = document.querySelector("title");
|
||||
const h1Element = document.querySelector("h1");
|
||||
const ogTitleMeta = document.querySelector('meta[property="og:title"]');
|
||||
|
||||
// Try multiple content extraction strategies
|
||||
const contentSelectors = [
|
||||
// Common article selectors
|
||||
"article",
|
||||
'[role="main"]',
|
||||
".article-content",
|
||||
".post-content",
|
||||
".entry-content",
|
||||
".content",
|
||||
".main-content",
|
||||
".article-body",
|
||||
".post-body",
|
||||
".story-body",
|
||||
".news-content",
|
||||
title =
|
||||
titleElement?.textContent?.trim() ||
|
||||
h1Element?.textContent?.trim() ||
|
||||
ogTitleMeta?.getAttribute("content") ||
|
||||
"";
|
||||
|
||||
// Japanese news site specific selectors
|
||||
".article",
|
||||
".news-article",
|
||||
".post",
|
||||
".entry",
|
||||
"#content",
|
||||
"#main",
|
||||
".main",
|
||||
// Extract description
|
||||
const descriptionMeta = document.querySelector('meta[name="description"]');
|
||||
const ogDescriptionMeta = document.querySelector(
|
||||
'meta[property="og:description"]',
|
||||
);
|
||||
|
||||
// Fallback to common containers
|
||||
".container",
|
||||
"#container",
|
||||
"main",
|
||||
"body",
|
||||
];
|
||||
description =
|
||||
descriptionMeta?.getAttribute("content") ||
|
||||
ogDescriptionMeta?.getAttribute("content") ||
|
||||
"";
|
||||
|
||||
for (const selector of contentSelectors) {
|
||||
const element = $(selector);
|
||||
if (element.length > 0) {
|
||||
// Get text content and clean it up
|
||||
let extractedText = element.text().trim();
|
||||
// Try multiple content extraction strategies
|
||||
const contentSelectors = [
|
||||
// Common article selectors
|
||||
"article",
|
||||
'[role="main"]',
|
||||
".article-content",
|
||||
".post-content",
|
||||
".entry-content",
|
||||
".content",
|
||||
".main-content",
|
||||
".article-body",
|
||||
".post-body",
|
||||
".story-body",
|
||||
".news-content",
|
||||
|
||||
// Remove extra whitespace and normalize
|
||||
extractedText = extractedText
|
||||
.replace(/\s+/g, " ")
|
||||
.replace(/\n\s*\n/g, "\n")
|
||||
.trim();
|
||||
// Japanese news site specific selectors
|
||||
".article",
|
||||
".news-article",
|
||||
".post",
|
||||
".entry",
|
||||
"#content",
|
||||
"#main",
|
||||
".main",
|
||||
|
||||
// Only use if we found substantial content
|
||||
if (extractedText.length > 200) {
|
||||
content = extractedText;
|
||||
break;
|
||||
// Fallback to common containers
|
||||
".container",
|
||||
"#container",
|
||||
"main",
|
||||
"body",
|
||||
];
|
||||
|
||||
for (const selector of contentSelectors) {
|
||||
const element = document.querySelector(selector);
|
||||
if (element) {
|
||||
// Get text content and clean it up
|
||||
let extractedText = element.textContent?.trim() || "";
|
||||
|
||||
// Remove extra whitespace and normalize
|
||||
extractedText = extractedText
|
||||
.replace(/\s+/g, " ")
|
||||
.replace(/\n\s*\n/g, "\n")
|
||||
.trim();
|
||||
|
||||
// Only use if we found substantial content
|
||||
if (extractedText.length > 200) {
|
||||
content = extractedText;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If still no content, try paragraph extraction
|
||||
if (!content) {
|
||||
const paragraphs = $("p")
|
||||
.map((_, el) => $(el).text().trim())
|
||||
.get();
|
||||
content = paragraphs
|
||||
.filter((p) => p.length > 50) // Filter out short paragraphs
|
||||
.join("\n\n");
|
||||
}
|
||||
// If still no content, try paragraph extraction
|
||||
if (!content) {
|
||||
const paragraphs = Array.from(document.querySelectorAll("p"))
|
||||
.map((p) => p.textContent?.trim() || "")
|
||||
.filter((p) => p.length > 50); // Filter out short paragraphs
|
||||
content = paragraphs.join("\n\n");
|
||||
}
|
||||
|
||||
// Final fallback: use body text
|
||||
if (!content || content.length < 100) {
|
||||
content = $("body").text().replace(/\s+/g, " ").trim();
|
||||
}
|
||||
// Final fallback: use body text
|
||||
if (!content || content.length < 100) {
|
||||
const bodyText = document.body?.textContent || "";
|
||||
content = bodyText.replace(/\s+/g, " ").trim();
|
||||
}
|
||||
|
||||
return { title, content, description };
|
||||
});
|
||||
|
||||
// Validate extracted content
|
||||
if (!content || content.length < 50) {
|
||||
if (!extractedData.content || extractedData.content.length < 50) {
|
||||
return {
|
||||
title,
|
||||
title: extractedData.title,
|
||||
content: "",
|
||||
description,
|
||||
description: extractedData.description,
|
||||
success: false,
|
||||
error: "Insufficient content extracted",
|
||||
};
|
||||
@ -135,14 +203,15 @@ export async function extractArticleContent(
|
||||
|
||||
// Limit content length to avoid token limits
|
||||
const maxLength = 5000;
|
||||
let content = extractedData.content;
|
||||
if (content.length > maxLength) {
|
||||
content = content.substring(0, maxLength) + "...";
|
||||
}
|
||||
|
||||
return {
|
||||
title,
|
||||
title: extractedData.title,
|
||||
content,
|
||||
description,
|
||||
description: extractedData.description,
|
||||
success: true,
|
||||
};
|
||||
} catch (error) {
|
||||
@ -153,6 +222,10 @@ export async function extractArticleContent(
|
||||
success: false,
|
||||
error: error instanceof Error ? error.message : "Unknown error occurred",
|
||||
};
|
||||
} finally {
|
||||
if (page) {
|
||||
await page.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user