Update
This commit is contained in:
		@@ -8,6 +8,20 @@ export interface ExtractedContent {
 | 
			
		||||
  error?: string;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 * Tuning parameters consumed by `retryWithBackoff`.
 */
interface RetryOptions {
  /**
   * Attempt number at which a failure is rethrown instead of retried.
   * Attempts are counted from 1, so this is the total number of tries.
   */
  maxRetries: number;
  /** Delay, in milliseconds, waited after the first failed attempt. */
  baseDelay: number;
  /** Upper bound, in milliseconds, applied to every computed delay. */
  maxDelay: number;
  /** Factor by which the delay grows for each further failed attempt. */
  backoffMultiplier: number;
}
 | 
			
		||||
 | 
			
		||||
/**
 * Retry settings used by `retryWithBackoff` when the caller passes none:
 * up to 3 attempts, waiting 1000 ms and then 2000 ms between them, with any
 * single wait capped at 10000 ms.
 */
const DEFAULT_RETRY_OPTIONS: RetryOptions = {
  maxRetries: 3,
  baseDelay: 1000,
  maxDelay: 10000,
  backoffMultiplier: 2,
};
 | 
			
		||||
 | 
			
		||||
// Single browser instance shared across extractions; stays null until
// getBrowser() assigns it.
let sharedBrowser: Browser | null = null;
 | 
			
		||||
 | 
			
		||||
@@ -146,12 +160,74 @@ async function getBrowser(): Promise<Browser> {
 | 
			
		||||
        "--disable-gpu",
 | 
			
		||||
        "--disable-web-security",
 | 
			
		||||
        "--disable-features=VizDisplayCompositor",
 | 
			
		||||
        "--disable-background-timer-throttling",
 | 
			
		||||
        "--disable-backgrounding-occluded-windows",
 | 
			
		||||
        "--disable-renderer-backgrounding",
 | 
			
		||||
        "--disable-field-trial-config",
 | 
			
		||||
        "--disable-ipc-flooding-protection",
 | 
			
		||||
        "--enable-automation",
 | 
			
		||||
        "--force-device-scale-factor=1",
 | 
			
		||||
        "--ignore-certificate-errors",
 | 
			
		||||
        "--ignore-ssl-errors",
 | 
			
		||||
        "--ignore-certificate-errors-spki-list",
 | 
			
		||||
        "--allow-running-insecure-content",
 | 
			
		||||
        "--disable-extensions",
 | 
			
		||||
        "--no-default-browser-check",
 | 
			
		||||
        "--disable-default-apps",
 | 
			
		||||
        "--disable-sync",
 | 
			
		||||
        "--metrics-recording-only",
 | 
			
		||||
        "--no-pings",
 | 
			
		||||
        "--mute-audio"
 | 
			
		||||
      ],
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
  return sharedBrowser;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Helper function for exponential backoff retry
 | 
			
		||||
// Substrings of an error message that mark a failure as transient and
// therefore worth retrying.
const RETRYABLE_ERROR_MARKERS: readonly string[] = [
  'ERR_SOCKET_NOT_CONNECTED',
  'ERR_CONNECTION_REFUSED',
  'ERR_CONNECTION_RESET',
  'ERR_NETWORK_CHANGED',
  'ERR_INTERNET_DISCONNECTED',
  'ERR_NAME_NOT_RESOLVED',
  'ERR_TIMED_OUT',
  'Protocol error',
  'Navigation timeout',
  'net::',
  'Target closed',
  'Session closed',
];

/**
 * Runs `operation`, retrying with exponential backoff while it keeps failing
 * with an error whose message contains one of the retryable markers.
 *
 * @param operation - Async work to attempt; invoked afresh on every try.
 * @param options - Retry limits and delay settings.
 * @param attempt - 1-based number of the first attempt; callers normally
 *   leave this at its default.
 * @returns The value `operation` resolves with.
 * @throws The original failure, unchanged, when the attempt number has
 *   reached `options.maxRetries`, when the thrown value is not an `Error`,
 *   or when its message matches no retryable marker.
 */
async function retryWithBackoff<T>(
  operation: () => Promise<T>,
  options: RetryOptions = DEFAULT_RETRY_OPTIONS,
  attempt: number = 1
): Promise<T> {
  for (let current = attempt; ; current++) {
    try {
      return await operation();
    } catch (error) {
      // Out of attempts: surface the last failure as-is.
      if (current >= options.maxRetries) {
        throw error;
      }

      // Explicit narrowing: only real Error objects with a known transient
      // message are retried; everything else propagates immediately.
      if (
        !(error instanceof Error) ||
        !RETRYABLE_ERROR_MARKERS.some((marker) => error.message.includes(marker))
      ) {
        throw error;
      }

      // baseDelay * multiplier^(attempt - 1), capped at maxDelay.
      const delay = Math.min(
        options.baseDelay * Math.pow(options.backoffMultiplier, current - 1),
        options.maxDelay
      );

      console.log(`Attempt ${current} failed, retrying in ${delay}ms:`, error.message);
      await waitForTimeout(delay);
    }
  }
}
 | 
			
		||||
 | 
			
		||||
export async function closeBrowser(): Promise<void> {
 | 
			
		||||
  if (sharedBrowser && sharedBrowser.isConnected()) {
 | 
			
		||||
    await sharedBrowser.close();
 | 
			
		||||
@@ -159,49 +235,79 @@ export async function closeBrowser(): Promise<void> {
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
export async function extractArticleContent(
 | 
			
		||||
  url: string,
 | 
			
		||||
): Promise<ExtractedContent> {
 | 
			
		||||
  console.log(`Starting content extraction for: ${url}`);
 | 
			
		||||
  let page = null;
 | 
			
		||||
  try {
 | 
			
		||||
    const browser = await getBrowser();
 | 
			
		||||
    page = await browser.newPage();
 | 
			
		||||
async function extractWithRetry(url: string): Promise<ExtractedContent> {
 | 
			
		||||
  const userAgents = [
 | 
			
		||||
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 | 
			
		||||
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 | 
			
		||||
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 | 
			
		||||
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
 | 
			
		||||
  ];
 | 
			
		||||
 | 
			
		||||
    // Set user agent and viewport
 | 
			
		||||
    await page.setUserAgent(
 | 
			
		||||
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
 | 
			
		||||
    );
 | 
			
		||||
    await page.setViewport({ width: 1280, height: 720 });
 | 
			
		||||
  return retryWithBackoff(async () => {
 | 
			
		||||
    let page = null;
 | 
			
		||||
    try {
 | 
			
		||||
      const browser = await getBrowser();
 | 
			
		||||
      page = await browser.newPage();
 | 
			
		||||
 | 
			
		||||
    // Set navigation timeout and disable images for faster loading
 | 
			
		||||
    page.setDefaultNavigationTimeout(45000);
 | 
			
		||||
    page.setDefaultTimeout(45000);
 | 
			
		||||
    
 | 
			
		||||
    // Block unnecessary resources to speed up loading
 | 
			
		||||
    await page.setRequestInterception(true);
 | 
			
		||||
    page.on('request', (req) => {
 | 
			
		||||
      const resourceType = req.resourceType();
 | 
			
		||||
      if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
 | 
			
		||||
        req.abort();
 | 
			
		||||
      } else {
 | 
			
		||||
        req.continue();
 | 
			
		||||
      // Randomize user agent to avoid detection
 | 
			
		||||
      const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)] || userAgents[0];
 | 
			
		||||
      await page.setUserAgent(userAgent!);
 | 
			
		||||
      await page.setViewport({ width: 1280, height: 720 });
 | 
			
		||||
 | 
			
		||||
      // Set longer timeout for problematic sites
 | 
			
		||||
      page.setDefaultNavigationTimeout(60000);
 | 
			
		||||
      page.setDefaultTimeout(60000);
 | 
			
		||||
      
 | 
			
		||||
      // Block unnecessary resources to speed up loading
 | 
			
		||||
      await page.setRequestInterception(true);
 | 
			
		||||
      page.on('request', (req) => {
 | 
			
		||||
        const resourceType = req.resourceType();
 | 
			
		||||
        if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
 | 
			
		||||
          req.abort();
 | 
			
		||||
        } else {
 | 
			
		||||
          req.continue();
 | 
			
		||||
        }
 | 
			
		||||
      });
 | 
			
		||||
 | 
			
		||||
      // Add extra headers to appear more like a real browser
 | 
			
		||||
      await page.setExtraHTTPHeaders({
 | 
			
		||||
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
 | 
			
		||||
        'Accept-Language': 'en-US,en;q=0.5',
 | 
			
		||||
        'Accept-Encoding': 'gzip, deflate, br',
 | 
			
		||||
        'DNT': '1',
 | 
			
		||||
        'Connection': 'keep-alive',
 | 
			
		||||
        'Upgrade-Insecure-Requests': '1'
 | 
			
		||||
      });
 | 
			
		||||
 | 
			
		||||
      // Navigate with multiple wait strategies
 | 
			
		||||
      let response;
 | 
			
		||||
      try {
 | 
			
		||||
        response = await page.goto(url, {
 | 
			
		||||
          waitUntil: ["domcontentloaded", "networkidle0"],
 | 
			
		||||
          timeout: 60000,
 | 
			
		||||
        });
 | 
			
		||||
      } catch (networkError) {
 | 
			
		||||
        // Fallback to more basic wait strategy
 | 
			
		||||
        const errorMessage = networkError instanceof Error ? networkError.message : 'Unknown error';
 | 
			
		||||
        console.log('Network idle failed, trying domcontentloaded only:', errorMessage);
 | 
			
		||||
        response = await page.goto(url, {
 | 
			
		||||
          waitUntil: "domcontentloaded",
 | 
			
		||||
          timeout: 60000,
 | 
			
		||||
        });
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    // Navigate to the page with better waiting strategy
 | 
			
		||||
    const response = await page.goto(url, {
 | 
			
		||||
      waitUntil: "domcontentloaded",
 | 
			
		||||
      timeout: 45000,
 | 
			
		||||
    });
 | 
			
		||||
      if (!response) {
 | 
			
		||||
        throw new Error('No response received from server');
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    if (!response || !response.ok()) {
 | 
			
		||||
      throw new Error(`HTTP ${response?.status()}: Failed to load page`);
 | 
			
		||||
    }
 | 
			
		||||
      const status = response?.status();
 | 
			
		||||
      if (status && status >= 400) {
 | 
			
		||||
        throw new Error(`HTTP ${status}: ${response?.statusText() || 'Unknown error'}`);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    // Enhanced dynamic content handling
 | 
			
		||||
    console.log('Handling dynamic content...');
 | 
			
		||||
    await handleDynamicContent(page);
 | 
			
		||||
      // Enhanced dynamic content handling
 | 
			
		||||
      console.log('Handling dynamic content...');
 | 
			
		||||
      await handleDynamicContent(page);
 | 
			
		||||
 | 
			
		||||
    // Extract content using advanced multi-strategy approach
 | 
			
		||||
    console.log('Extracting content using multi-strategy approach...');
 | 
			
		||||
@@ -420,13 +526,13 @@ export async function extractArticleContent(
 | 
			
		||||
      
 | 
			
		||||
      console.log(`Found ${candidates.length} content candidates`);
 | 
			
		||||
      if (candidates.length > 0) {
 | 
			
		||||
        console.log(`Best candidate score: ${candidates[0].score}, selector: ${candidates[0].selector}`);
 | 
			
		||||
        console.log(`Best candidate score: ${candidates[0]!.score}, selector: ${candidates[0]!.selector}`);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Get the best content
 | 
			
		||||
      let content = "";
 | 
			
		||||
      if (candidates.length > 0) {
 | 
			
		||||
        content = candidates[0].content;
 | 
			
		||||
        content = candidates[0]!.content;
 | 
			
		||||
        
 | 
			
		||||
        // If the best candidate is still short, try combining top candidates
 | 
			
		||||
        if (content.length < 500 && candidates.length > 1) {
 | 
			
		||||
@@ -513,9 +619,9 @@ export async function extractArticleContent(
 | 
			
		||||
        extractedData.content = fallbackData;
 | 
			
		||||
      } else {
 | 
			
		||||
        return {
 | 
			
		||||
          title: extractedData.title,
 | 
			
		||||
          title: extractedData.title || '',
 | 
			
		||||
          content: extractedData.content || "",
 | 
			
		||||
          description: extractedData.description,
 | 
			
		||||
          description: extractedData.description || '',
 | 
			
		||||
          success: false,
 | 
			
		||||
          error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
 | 
			
		||||
        };
 | 
			
		||||
@@ -529,26 +635,65 @@ export async function extractArticleContent(
 | 
			
		||||
      content = content.substring(0, maxLength) + "...";
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    console.log(`Successfully extracted content: ${content.length} characters`);
 | 
			
		||||
    return {
 | 
			
		||||
      title: extractedData.title,
 | 
			
		||||
      content,
 | 
			
		||||
      description: extractedData.description,
 | 
			
		||||
      success: true,
 | 
			
		||||
    };
 | 
			
		||||
      console.log(`Successfully extracted content: ${content.length} characters`);
 | 
			
		||||
      return {
 | 
			
		||||
        title: extractedData.title,
 | 
			
		||||
        content,
 | 
			
		||||
        description: extractedData.description,
 | 
			
		||||
        success: true,
 | 
			
		||||
      };
 | 
			
		||||
    } catch (error) {
 | 
			
		||||
      console.error(`Content extraction attempt failed:`, error);
 | 
			
		||||
      throw error; // Let retry logic handle this
 | 
			
		||||
    } finally {
 | 
			
		||||
      if (page) {
 | 
			
		||||
        try {
 | 
			
		||||
          await page.close();
 | 
			
		||||
        } catch (closeError) {
 | 
			
		||||
          console.warn('Failed to close page:', closeError);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 * Extracts the article at `url` by delegating to `extractWithRetry`, and
 * converts any failure into a result object instead of rejecting.
 *
 * @param url - Address of the page to extract.
 * @returns Whatever `extractWithRetry` resolves with; if it throws, a result
 *   with empty `title`/`content`/`description`, `success: false`, and an
 *   `error` message derived from the thrown value.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  console.log(`Starting content extraction for: ${url}`);

  try {
    return await extractWithRetry(url);
  } catch (error) {
    console.error(`Content extraction failed after all retries for ${url}:`, error);

    // Map well-known failure signatures to friendlier messages; anything
    // else falls back to the raw message, or a generic text for non-Errors.
    let errorMessage = "Unknown error occurred";
    if (error instanceof Error) {
      if (error.message.includes('ERR_SOCKET_NOT_CONNECTED')) {
        errorMessage = "Network connection failed - server may be unreachable";
      } else if (error.message.includes('ERR_CONNECTION_REFUSED')) {
        errorMessage = "Connection refused by server";
      } else if (error.message.includes('ERR_NAME_NOT_RESOLVED')) {
        errorMessage = "DNS resolution failed - domain may not exist";
      } else if (error.message.includes('ERR_TIMED_OUT')) {
        errorMessage = "Request timed out - server too slow";
      } else if (error.message.includes('HTTP 4')) {
        errorMessage = `Client error: ${error.message}`;
      } else if (error.message.includes('HTTP 5')) {
        errorMessage = `Server error: ${error.message}`;
      } else {
        errorMessage = error.message;
      }
    }

    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: errorMessage,
    };
  }
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user