Apply formatting

This commit is contained in:
2025-06-08 15:21:58 +09:00
parent b5ff912fcb
commit a728ebb66c
28 changed files with 1809 additions and 1137 deletions

View File

@ -1,4 +1,4 @@
import * as cheerio from 'cheerio';
import * as cheerio from "cheerio";
export interface ExtractedContent {
title?: string;
@ -8,17 +8,21 @@ export interface ExtractedContent {
error?: string;
}
export async function extractArticleContent(url: string): Promise<ExtractedContent> {
export async function extractArticleContent(
url: string,
): Promise<ExtractedContent> {
try {
// Fetch the HTML content
const response = await fetch(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'ja,en-US;q=0.7,en;q=0.3',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
Accept:
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
Connection: "keep-alive",
"Upgrade-Insecure-Requests": "1",
},
signal: AbortSignal.timeout(30000), // 30 second timeout
});
@ -31,52 +35,56 @@ export async function extractArticleContent(url: string): Promise<ExtractedConte
const $ = cheerio.load(html);
// Remove unwanted elements
$('script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments').remove();
$(
"script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments",
).remove();
let content = '';
let title = '';
let description = '';
let content = "";
let title = "";
let description = "";
// Extract title
title = $('title').text().trim() ||
$('h1').first().text().trim() ||
$('meta[property="og:title"]').attr('content') ||
'';
title =
$("title").text().trim() ||
$("h1").first().text().trim() ||
$('meta[property="og:title"]').attr("content") ||
"";
// Extract description
description = $('meta[name="description"]').attr('content') ||
$('meta[property="og:description"]').attr('content') ||
'';
description =
$('meta[name="description"]').attr("content") ||
$('meta[property="og:description"]').attr("content") ||
"";
// Try multiple content extraction strategies
const contentSelectors = [
// Common article selectors
'article',
"article",
'[role="main"]',
'.article-content',
'.post-content',
'.entry-content',
'.content',
'.main-content',
'.article-body',
'.post-body',
'.story-body',
'.news-content',
".article-content",
".post-content",
".entry-content",
".content",
".main-content",
".article-body",
".post-body",
".story-body",
".news-content",
// Japanese news site specific selectors
'.article',
'.news-article',
'.post',
'.entry',
'#content',
'#main',
'.main',
".article",
".news-article",
".post",
".entry",
"#content",
"#main",
".main",
// Fallback to common containers
'.container',
'#container',
'main',
'body'
".container",
"#container",
"main",
"body",
];
for (const selector of contentSelectors) {
@ -84,11 +92,11 @@ export async function extractArticleContent(url: string): Promise<ExtractedConte
if (element.length > 0) {
// Get text content and clean it up
let extractedText = element.text().trim();
// Remove extra whitespace and normalize
extractedText = extractedText
.replace(/\s+/g, ' ')
.replace(/\n\s*\n/g, '\n')
.replace(/\s+/g, " ")
.replace(/\n\s*\n/g, "\n")
.trim();
// Only use if we found substantial content
@ -101,50 +109,49 @@ export async function extractArticleContent(url: string): Promise<ExtractedConte
// If still no content, try paragraph extraction
if (!content) {
const paragraphs = $('p').map((_, el) => $(el).text().trim()).get();
const paragraphs = $("p")
.map((_, el) => $(el).text().trim())
.get();
content = paragraphs
.filter(p => p.length > 50) // Filter out short paragraphs
.join('\n\n');
.filter((p) => p.length > 50) // Filter out short paragraphs
.join("\n\n");
}
// Final fallback: use body text
if (!content || content.length < 100) {
content = $('body').text()
.replace(/\s+/g, ' ')
.trim();
content = $("body").text().replace(/\s+/g, " ").trim();
}
// Validate extracted content
if (!content || content.length < 50) {
return {
title,
content: '',
content: "",
description,
success: false,
error: 'Insufficient content extracted'
error: "Insufficient content extracted",
};
}
// Limit content length to avoid token limits
const maxLength = 5000;
if (content.length > maxLength) {
content = content.substring(0, maxLength) + '...';
content = content.substring(0, maxLength) + "...";
}
return {
title,
content,
description,
success: true
success: true,
};
} catch (error) {
return {
title: '',
content: '',
description: '',
title: "",
content: "",
description: "",
success: false,
error: error instanceof Error ? error.message : 'Unknown error occurred'
error: error instanceof Error ? error.message : "Unknown error occurred",
};
}
}
/**
 * enhanceArticleContent: picks the content/description pair to use for an article.
 *
 * Order of preference, as implemented below:
 *   1. If the caller-supplied text is already longer than 500 characters,
 *      return the caller-supplied values untouched (no network access).
 *   2. Otherwise call extractArticleContent(originalLink) and, when it reports
 *      success with non-empty content, return the extracted content.
 *   3. Otherwise return the caller-supplied values untouched.
 *
 * Both fields of the result may be undefined, because the optional inputs are
 * passed through as-is.
 *
 * NOTE(review): this listing looks like a rendered diff hunk - every line the
 * formatter changed appears twice (old form, then new form). Only one line of
 * each such pair belongs in the actual source file; confirm against the repo.
 */
@ -153,30 +160,30 @@ export async function enhanceArticleContent(
// NOTE(review): originalTitle is not referenced anywhere in the visible body.
originalTitle: string,
// URL handed to extractArticleContent when the supplied text is too short.
originalLink: string,
originalContent?: string,
originalDescription?: string
originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
// If we already have substantial content, use it
// `||` means the description is only measured when originalContent is
// undefined or empty; a short non-empty content hides a longer description.
const existingContent = originalContent || originalDescription || '';
const existingContent = originalContent || originalDescription || "";
if (existingContent.length > 500) {
// Inputs are returned unchanged, so `content` is still undefined/empty when
// the length check passed on the description alone.
return {
content: originalContent,
description: originalDescription
description: originalDescription,
};
}
// Try to extract content from the URL
const extracted = await extractArticleContent(originalLink);
// Require both the success flag and a non-empty content string.
if (extracted.success && extracted.content) {
return {
content: extracted.content,
// Prefer the extracted description; fall back to the caller's when it is
// missing or empty.
description: extracted.description || originalDescription
description: extracted.description || originalDescription,
};
}
// Return original content if extraction failed
return {
content: originalContent,
description: originalDescription
description: originalDescription,
};
}
}