// File listing metadata: 190 lines, 4.8 KiB, TypeScript
import * as cheerio from "cheerio";
/**
 * Outcome of one attempt to pull readable article text out of a web page.
 *
 * The object is returned for both successful and failed attempts; check
 * `success` before using `content`.
 */
export interface ExtractedContent {
  /** Page title, or an empty string / undefined when none was found. */
  title?: string;
  /** Whitespace-normalized article text; empty when extraction failed. */
  content: string;
  /** Meta description of the page, when one was found. */
  description?: string;
  /** True when enough article text was extracted to be usable. */
  success: boolean;
  /** Human-readable failure reason; only set when `success` is false. */
  error?: string;
}

/** Upper bound on the number of UTF-16 code units of article text returned. */
const MAX_CONTENT_LENGTH = 5000;

/** How long to wait for the page download before aborting, in milliseconds. */
const FETCH_TIMEOUT_MS = 30000;

/** How many leading bytes are scanned for a `<meta>` charset declaration. */
const CHARSET_PRESCAN_BYTES = 4096;

/** Page chrome that never belongs to the article body. */
const UNWANTED_SELECTOR =
  "script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments";

/**
 * Candidate containers for the article body, tried in order from most to
 * least specific. The first one holding more than 200 characters wins.
 */
const CONTENT_SELECTORS: readonly string[] = [
  // Common article selectors
  "article",
  '[role="main"]',
  ".article-content",
  ".post-content",
  ".entry-content",
  ".content",
  ".main-content",
  ".article-body",
  ".post-body",
  ".story-body",
  ".news-content",

  // Japanese news site specific selectors
  ".article",
  ".news-article",
  ".post",
  ".entry",
  "#content",
  "#main",
  ".main",

  // Fallback to common containers
  ".container",
  "#container",
  "main",
  "body",
];

/** Returns why `url` cannot be fetched over HTTP(S), or undefined if it can. */
function describeUrlProblem(url: string): string | undefined {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return `Invalid URL: ${url}`;
  }
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    return `Unsupported URL protocol: ${parsed.protocol}`;
  }
  return undefined;
}

/**
 * Decodes a downloaded HTML document using the charset declared in the
 * Content-Type header or, failing that, in a `<meta>` tag near the top of the
 * document. Falls back to UTF-8 when nothing is declared or the label is not
 * supported by the runtime.
 */
function decodeHtml(bytes: Uint8Array, contentType: string | null): string {
  const headerCharset = /charset\s*=\s*["']?\s*([\w-]+)/i.exec(
    contentType ?? "",
  )?.[1];

  // latin1 maps every byte to a character, so the ASCII markup of the
  // declaration can be searched before the real encoding is known.
  const prescan = new TextDecoder("latin1").decode(
    bytes.subarray(0, CHARSET_PRESCAN_BYTES),
  );
  const metaCharset = /<meta[^>]+charset\s*=\s*["']?\s*([\w-]+)/i.exec(
    prescan,
  )?.[1];

  const label = headerCharset ?? metaCharset ?? "utf-8";
  try {
    return new TextDecoder(label).decode(bytes);
  } catch {
    // Unknown or unsupported label: decode as UTF-8 rather than failing.
    return new TextDecoder("utf-8").decode(bytes);
  }
}

/** Collapses every run of whitespace (including newlines) to one space. */
function normalizeWhitespace(text: string): string {
  return text.replace(/\s+/g, " ").trim();
}

/**
 * Cuts `text` to at most `maxLength` UTF-16 code units and appends "..." when
 * something was removed. The cut never lands between the two halves of a
 * surrogate pair, so the result is always well-formed.
 */
function truncateContent(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }
  let end = maxLength;
  const lastUnit = text.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return text.substring(0, end) + "...";
}

/**
 * Downloads `url` and extracts the page title, meta description and main
 * article text.
 *
 * Extraction strategy, in order: the first known article container holding
 * more than 200 characters, then the concatenation of all paragraphs longer
 * than 50 characters, then the whole body text. The result is truncated to
 * 5000 characters plus a trailing "...".
 *
 * @param url Absolute http(s) URL of the article page.
 * @returns The extraction result. This function never rejects: invalid URLs,
 *   network errors, timeouts and non-2xx responses are reported through
 *   `success: false` and `error`.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  try {
    // Reject unparseable and non-HTTP(S) URLs before touching the network.
    const urlProblem = describeUrlProblem(url);
    if (urlProblem !== undefined) {
      throw new Error(urlProblem);
    }

    // Fetch the HTML content
    const response = await fetch(url, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        Accept:
          "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        Connection: "keep-alive",
        "Upgrade-Insecure-Requests": "1",
      },
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    // Decode with the declared charset; response.text() would assume UTF-8
    // and garble Shift_JIS / EUC-JP pages.
    const html = decodeHtml(
      new Uint8Array(await response.arrayBuffer()),
      response.headers.get("content-type"),
    );
    const $ = cheerio.load(html);

    // Remove unwanted elements
    $(UNWANTED_SELECTOR).remove();

    // Extract title. Inline SVG icons carry their own <title> elements, so
    // those are excluded and only the first remaining match is used.
    const title =
      $("title").not("svg title").first().text().trim() ||
      $("h1").first().text().trim() ||
      $('meta[property="og:title"]').attr("content")?.trim() ||
      "";

    // Extract description
    const description =
      $('meta[name="description"]').attr("content")?.trim() ||
      $('meta[property="og:description"]').attr("content")?.trim() ||
      "";

    // Try multiple content extraction strategies
    let content = "";
    for (const selector of CONTENT_SELECTORS) {
      // Keep only outermost matches: a match nested inside another match is
      // already part of its ancestor's text and would otherwise be duplicated.
      const outermost = $(selector).filter(
        (_, el) => $(el).parents(selector).length === 0,
      );
      if (outermost.length === 0) {
        continue;
      }

      const extractedText = normalizeWhitespace(outermost.text());

      // Only use if we found substantial content
      if (extractedText.length > 200) {
        content = extractedText;
        break;
      }
    }

    // If still no content, try paragraph extraction
    if (!content) {
      const paragraphs = $("p")
        .map((_, el) => $(el).text().trim())
        .get();
      content = paragraphs
        .filter((p) => p.length > 50) // Filter out short paragraphs
        .join("\n\n");
    }

    // Final fallback: use body text
    if (!content || content.length < 100) {
      content = normalizeWhitespace($("body").text());
    }

    // Validate extracted content
    if (!content || content.length < 50) {
      return {
        title,
        content: "",
        description,
        success: false,
        error: "Insufficient content extracted",
      };
    }

    // Limit content length to avoid token limits
    return {
      title,
      content: truncateContent(content, MAX_CONTENT_LENGTH),
      description,
      success: true,
    };
  } catch (error: unknown) {
    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  }
}

/**
 * Returns the best available content/description for a feed item, scraping
 * the linked page only when the feed itself did not supply enough text.
 *
 * The feed values are returned untouched when the content (or, if the content
 * is empty, the description) is longer than 500 characters, when the link is
 * blank, or when scraping the link fails.
 *
 * @param originalTitle Title from the feed item. Currently unused; kept so
 *   existing callers keep working.
 * @param originalLink URL of the full article.
 * @param originalContent Content supplied by the feed, if any.
 * @param originalDescription Description supplied by the feed, if any.
 * @returns The scraped content (with the scraped description, falling back to
 *   the feed description) on success, otherwise the feed values.
 */
export async function enhanceArticleContent(
  originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  const feedValues = {
    content: originalContent,
    description: originalDescription,
  };

  // If we already have substantial content, use it
  const existingContent = originalContent || originalDescription || "";
  if (existingContent.length > 500) {
    return feedValues;
  }

  // Without a link there is nothing to scrape; skip the doomed request.
  if (!originalLink?.trim()) {
    return feedValues;
  }

  // Try to extract content from the URL
  const extracted = await extractArticleContent(originalLink);
  if (extracted.success && extracted.content) {
    return {
      content: extracted.content,
      description: extracted.description || originalDescription,
    };
  }

  // Return original content if extraction failed
  return feedValues;
}