Update
Some checks failed
Build and Publish Docker Images / build (push) Failing after 4m16s
CI / security-scan (push) Failing after 12m2s
CI / lint-and-test (push) Failing after 12m8s
CI / docker-test (push) Has been cancelled

This commit is contained in:
2025-06-12 12:20:25 +09:00
parent 0760909de1
commit 886dc94429

View File

@ -8,6 +8,20 @@ export interface ExtractedContent {
error?: string;
}
/**
 * Settings that control how retryWithBackoff re-runs a failed operation.
 */
interface RetryOptions {
// Attempt number at which retrying stops: retryWithBackoff rethrows the error
// once its 1-based attempt counter reaches this value.
maxRetries: number;
// Wait before the first retry (retryWithBackoff logs this value with an "ms" suffix).
baseDelay: number;
// Upper bound applied to the computed wait, in the same unit as baseDelay.
maxDelay: number;
// Factor the wait is multiplied by for each additional failed attempt.
backoffMultiplier: number;
}
/**
 * Options retryWithBackoff uses when the caller passes none.
 * With these values an operation is tried at most 3 times, waiting 1000 and
 * then 2000 (logged as ms) between attempts; waits are capped at 10000.
 */
const DEFAULT_RETRY_OPTIONS: RetryOptions = {
maxRetries: 3,
baseDelay: 1000,
maxDelay: 10000,
backoffMultiplier: 2
};
// Singleton browser instance for reuse: getBrowser() returns it and
// closeBrowser() closes it. Starts out as null.
let sharedBrowser: Browser | null = null;
@ -146,12 +160,74 @@ async function getBrowser(): Promise<Browser> {
"--disable-gpu",
"--disable-web-security",
"--disable-features=VizDisplayCompositor",
"--disable-background-timer-throttling",
"--disable-backgrounding-occluded-windows",
"--disable-renderer-backgrounding",
"--disable-field-trial-config",
"--disable-ipc-flooding-protection",
"--enable-automation",
"--force-device-scale-factor=1",
"--ignore-certificate-errors",
"--ignore-ssl-errors",
"--ignore-certificate-errors-spki-list",
"--allow-running-insecure-content",
"--disable-extensions",
"--no-default-browser-check",
"--disable-default-apps",
"--disable-sync",
"--metrics-recording-only",
"--no-pings",
"--mute-audio"
],
});
}
return sharedBrowser;
}
/**
 * Runs `operation` and, when it fails with an error that looks transient,
 * runs it again after an exponentially growing pause.
 *
 * An error counts as retryable only if it is an `Error` whose message contains
 * one of the network / browser-session markers listed in the function body.
 * Any other failure is rethrown straight away.
 *
 * @param operation - Async work to perform; it is invoked afresh on every attempt.
 * @param options - Backoff settings. Once the attempt number reaches
 *   `options.maxRetries` the most recent error is rethrown instead of retried.
 * @param attempt - 1-based number of the attempt to start from (normally omitted).
 * @returns Whatever the first successful `operation()` call resolves to.
 * @throws The error raised by `operation` when it is not retryable or when the
 *   attempt limit has been reached.
 */
async function retryWithBackoff<T>(
  operation: () => Promise<T>,
  options: RetryOptions = DEFAULT_RETRY_OPTIONS,
  attempt: number = 1
): Promise<T> {
  // Message fragments that identify a failure worth retrying.
  const transientMarkers = [
    'ERR_SOCKET_NOT_CONNECTED',
    'ERR_CONNECTION_REFUSED',
    'ERR_CONNECTION_RESET',
    'ERR_NETWORK_CHANGED',
    'ERR_INTERNET_DISCONNECTED',
    'ERR_NAME_NOT_RESOLVED',
    'ERR_TIMED_OUT',
    'Protocol error',
    'Navigation timeout',
    'net::',
    'Target closed',
    'Session closed',
  ];

  for (let currentAttempt = attempt; ; currentAttempt += 1) {
    try {
      return await operation();
    } catch (error) {
      // Out of attempts: surface the last failure unchanged.
      if (currentAttempt >= options.maxRetries) {
        throw error;
      }

      // Non-Error throwables are never considered transient.
      if (!(error instanceof Error)) {
        throw error;
      }

      const failureMessage = error.message;
      const looksTransient = transientMarkers.some((marker) =>
        failureMessage.includes(marker)
      );
      if (!looksTransient) {
        throw error;
      }

      // baseDelay * multiplier^(attempt - 1), never exceeding maxDelay.
      const uncappedDelay =
        options.baseDelay * options.backoffMultiplier ** (currentAttempt - 1);
      const delay = Math.min(uncappedDelay, options.maxDelay);

      console.log(`Attempt ${currentAttempt} failed, retrying in ${delay}ms:`, failureMessage);
      await waitForTimeout(delay);
    }
  }
}
export async function closeBrowser(): Promise<void> {
if (sharedBrowser && sharedBrowser.isConnected()) {
await sharedBrowser.close();
@ -159,49 +235,79 @@ export async function closeBrowser(): Promise<void> {
}
}
export async function extractArticleContent(
url: string,
): Promise<ExtractedContent> {
console.log(`Starting content extraction for: ${url}`);
let page = null;
try {
const browser = await getBrowser();
page = await browser.newPage();
async function extractWithRetry(url: string): Promise<ExtractedContent> {
const userAgents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
];
// Set user agent and viewport
await page.setUserAgent(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
);
await page.setViewport({ width: 1280, height: 720 });
return retryWithBackoff(async () => {
let page = null;
try {
const browser = await getBrowser();
page = await browser.newPage();
// Set navigation timeout and disable images for faster loading
page.setDefaultNavigationTimeout(45000);
page.setDefaultTimeout(45000);
// Block unnecessary resources to speed up loading
await page.setRequestInterception(true);
page.on('request', (req) => {
const resourceType = req.resourceType();
if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
req.abort();
} else {
req.continue();
// Randomize user agent to avoid detection
const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)] || userAgents[0];
await page.setUserAgent(userAgent!);
await page.setViewport({ width: 1280, height: 720 });
// Set longer timeout for problematic sites
page.setDefaultNavigationTimeout(60000);
page.setDefaultTimeout(60000);
// Block unnecessary resources to speed up loading
await page.setRequestInterception(true);
page.on('request', (req) => {
const resourceType = req.resourceType();
if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
req.abort();
} else {
req.continue();
}
});
// Add extra headers to appear more like a real browser
await page.setExtraHTTPHeaders({
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
});
// Navigate with multiple wait strategies
let response;
try {
response = await page.goto(url, {
waitUntil: ["domcontentloaded", "networkidle0"],
timeout: 60000,
});
} catch (networkError) {
// Fallback to more basic wait strategy
const errorMessage = networkError instanceof Error ? networkError.message : 'Unknown error';
console.log('Network idle failed, trying domcontentloaded only:', errorMessage);
response = await page.goto(url, {
waitUntil: "domcontentloaded",
timeout: 60000,
});
}
});
// Navigate to the page with better waiting strategy
const response = await page.goto(url, {
waitUntil: "domcontentloaded",
timeout: 45000,
});
if (!response) {
throw new Error('No response received from server');
}
if (!response || !response.ok()) {
throw new Error(`HTTP ${response?.status()}: Failed to load page`);
}
const status = response?.status();
if (status && status >= 400) {
throw new Error(`HTTP ${status}: ${response?.statusText() || 'Unknown error'}`);
}
// Enhanced dynamic content handling
console.log('Handling dynamic content...');
await handleDynamicContent(page);
// Enhanced dynamic content handling
console.log('Handling dynamic content...');
await handleDynamicContent(page);
// Extract content using advanced multi-strategy approach
console.log('Extracting content using multi-strategy approach...');
@ -420,13 +526,13 @@ export async function extractArticleContent(
console.log(`Found ${candidates.length} content candidates`);
if (candidates.length > 0) {
console.log(`Best candidate score: ${candidates[0].score}, selector: ${candidates[0].selector}`);
console.log(`Best candidate score: ${candidates[0]!.score}, selector: ${candidates[0]!.selector}`);
}
// Get the best content
let content = "";
if (candidates.length > 0) {
content = candidates[0].content;
content = candidates[0]!.content;
// If the best candidate is still short, try combining top candidates
if (content.length < 500 && candidates.length > 1) {
@ -513,9 +619,9 @@ export async function extractArticleContent(
extractedData.content = fallbackData;
} else {
return {
title: extractedData.title,
title: extractedData.title || '',
content: extractedData.content || "",
description: extractedData.description,
description: extractedData.description || '',
success: false,
error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
};
@ -529,26 +635,65 @@ export async function extractArticleContent(
content = content.substring(0, maxLength) + "...";
}
console.log(`Successfully extracted content: ${content.length} characters`);
return {
title: extractedData.title,
content,
description: extractedData.description,
success: true,
};
console.log(`Successfully extracted content: ${content.length} characters`);
return {
title: extractedData.title,
content,
description: extractedData.description,
success: true,
};
} catch (error) {
console.error(`Content extraction attempt failed:`, error);
throw error; // Let retry logic handle this
} finally {
if (page) {
try {
await page.close();
} catch (closeError) {
console.warn('Failed to close page:', closeError);
}
}
}
});
}
export async function extractArticleContent(
url: string,
): Promise<ExtractedContent> {
console.log(`Starting content extraction for: ${url}`);
try {
return await extractWithRetry(url);
} catch (error) {
console.error(`Content extraction failed for ${url}:`, error);
console.error(`Content extraction failed after all retries for ${url}:`, error);
// Provide more specific error messages
let errorMessage = "Unknown error occurred";
if (error instanceof Error) {
if (error.message.includes('ERR_SOCKET_NOT_CONNECTED')) {
errorMessage = "Network connection failed - server may be unreachable";
} else if (error.message.includes('ERR_CONNECTION_REFUSED')) {
errorMessage = "Connection refused by server";
} else if (error.message.includes('ERR_NAME_NOT_RESOLVED')) {
errorMessage = "DNS resolution failed - domain may not exist";
} else if (error.message.includes('ERR_TIMED_OUT')) {
errorMessage = "Request timed out - server too slow";
} else if (error.message.includes('HTTP 4')) {
errorMessage = `Client error: ${error.message}`;
} else if (error.message.includes('HTTP 5')) {
errorMessage = `Server error: ${error.message}`;
} else {
errorMessage = error.message;
}
}
return {
title: "",
content: "",
description: "",
success: false,
error: error instanceof Error ? error.message : "Unknown error occurred",
error: errorMessage,
};
} finally {
if (page) {
await page.close();
}
}
}