Update
This commit is contained in:
		@@ -8,6 +8,20 @@ export interface ExtractedContent {
 | 
				
			|||||||
  error?: string;
 | 
					  error?: string;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
					 * Settings that control how `retryWithBackoff` spaces out and limits retries.
					 */
					interface RetryOptions {
					  /** Cap on the total number of attempts; the initial call counts as attempt 1. */
					  maxRetries: number;
					  /** Delay, in milliseconds, applied before the first retry. */
					  baseDelay: number;
					  /** Upper limit, in milliseconds, for any single computed delay. */
					  maxDelay: number;
					  /** Factor the delay grows by after each additional failed attempt. */
					  backoffMultiplier: number;
					}

					/**
					 * Retry settings used when a caller does not supply its own.
					 *
					 * The object is frozen because it is shared by reference as a default
					 * parameter value: an accidental write would otherwise change the retry
					 * behaviour of every later call. Callers that need different values should
					 * build a new object, e.g. `{ ...DEFAULT_RETRY_OPTIONS, maxRetries: 5 }`.
					 */
					const DEFAULT_RETRY_OPTIONS: Readonly<RetryOptions> = Object.freeze({
					  maxRetries: 3,
					  baseDelay: 1000,
					  maxDelay: 10000,
					  backoffMultiplier: 2,
					});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Singleton browser instance for reuse
 | 
					// Singleton browser instance for reuse
 | 
				
			||||||
let sharedBrowser: Browser | null = null;
 | 
					let sharedBrowser: Browser | null = null;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -146,12 +160,74 @@ async function getBrowser(): Promise<Browser> {
 | 
				
			|||||||
        "--disable-gpu",
 | 
					        "--disable-gpu",
 | 
				
			||||||
        "--disable-web-security",
 | 
					        "--disable-web-security",
 | 
				
			||||||
        "--disable-features=VizDisplayCompositor",
 | 
					        "--disable-features=VizDisplayCompositor",
 | 
				
			||||||
 | 
					        "--disable-background-timer-throttling",
 | 
				
			||||||
 | 
					        "--disable-backgrounding-occluded-windows",
 | 
				
			||||||
 | 
					        "--disable-renderer-backgrounding",
 | 
				
			||||||
 | 
					        "--disable-field-trial-config",
 | 
				
			||||||
 | 
					        "--disable-ipc-flooding-protection",
 | 
				
			||||||
 | 
					        "--enable-automation",
 | 
				
			||||||
 | 
					        "--force-device-scale-factor=1",
 | 
				
			||||||
 | 
					        "--ignore-certificate-errors",
 | 
				
			||||||
 | 
					        "--ignore-ssl-errors",
 | 
				
			||||||
 | 
					        "--ignore-certificate-errors-spki-list",
 | 
				
			||||||
 | 
					        "--allow-running-insecure-content",
 | 
				
			||||||
 | 
					        "--disable-extensions",
 | 
				
			||||||
 | 
					        "--no-default-browser-check",
 | 
				
			||||||
 | 
					        "--disable-default-apps",
 | 
				
			||||||
 | 
					        "--disable-sync",
 | 
				
			||||||
 | 
					        "--metrics-recording-only",
 | 
				
			||||||
 | 
					        "--no-pings",
 | 
				
			||||||
 | 
					        "--mute-audio"
 | 
				
			||||||
      ],
 | 
					      ],
 | 
				
			||||||
    });
 | 
					    });
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  return sharedBrowser;
 | 
					  return sharedBrowser;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
					 * Runs `operation` and retries it with exponential backoff on transient failures.
					 *
					 * A failure is retried only when it is an `Error` whose message contains one of
					 * the network / browser-protocol fragments listed below; any other failure is
					 * rethrown unchanged. The wait before a retry is
					 * `baseDelay * backoffMultiplier^(attempt - 1)`, capped at `maxDelay`.
					 *
					 * @param operation - Async work to run; invoked once per attempt.
					 * @param options - Backoff settings; `maxRetries` caps the total number of attempts.
					 * @param attempt - 1-based number of the first attempt (drives delay and logging).
					 * @returns The value resolved by the first successful call to `operation`.
					 * @throws The error from `operation` when it is not retryable or when the
					 *   attempt number has reached `options.maxRetries`.
					 */
					async function retryWithBackoff<T>(
					  operation: () => Promise<T>,
					  options: RetryOptions = DEFAULT_RETRY_OPTIONS,
					  attempt: number = 1
					): Promise<T> {
					  // Message fragments that mark a failure as transient and worth retrying.
					  const retryableFragments: readonly string[] = [
					    'ERR_SOCKET_NOT_CONNECTED',
					    'ERR_CONNECTION_REFUSED',
					    'ERR_CONNECTION_RESET',
					    'ERR_NETWORK_CHANGED',
					    'ERR_INTERNET_DISCONNECTED',
					    'ERR_NAME_NOT_RESOLVED',
					    'ERR_TIMED_OUT',
					    'Protocol error',
					    'Navigation timeout',
					    'net::',
					    'Target closed',
					    'Session closed',
					  ];

					  // Iterate instead of recursing so a long retry sequence does not build a
					  // chain of nested pending promises.
					  for (let currentAttempt = attempt; ; currentAttempt += 1) {
					    try {
					      return await operation();
					    } catch (error) {
					      if (currentAttempt >= options.maxRetries) {
					        throw error;
					      }

					      // Only genuine Error instances with a recognised message are retried.
					      if (!(error instanceof Error)) {
					        throw error;
					      }
					      const { message } = error;
					      if (!retryableFragments.some((fragment) => message.includes(fragment))) {
					        throw error;
					      }

					      const delay = Math.min(
					        options.baseDelay * Math.pow(options.backoffMultiplier, currentAttempt - 1),
					        options.maxDelay
					      );

					      console.log(`Attempt ${currentAttempt} failed, retrying in ${delay}ms:`, message);
					      await waitForTimeout(delay);
					    }
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export async function closeBrowser(): Promise<void> {
 | 
					export async function closeBrowser(): Promise<void> {
 | 
				
			||||||
  if (sharedBrowser && sharedBrowser.isConnected()) {
 | 
					  if (sharedBrowser && sharedBrowser.isConnected()) {
 | 
				
			||||||
    await sharedBrowser.close();
 | 
					    await sharedBrowser.close();
 | 
				
			||||||
@@ -159,49 +235,79 @@ export async function closeBrowser(): Promise<void> {
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export async function extractArticleContent(
 | 
					async function extractWithRetry(url: string): Promise<ExtractedContent> {
 | 
				
			||||||
  url: string,
 | 
					  const userAgents = [
 | 
				
			||||||
): Promise<ExtractedContent> {
 | 
					    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 | 
				
			||||||
  console.log(`Starting content extraction for: ${url}`);
 | 
					    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 | 
				
			||||||
  let page = null;
 | 
					    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 | 
				
			||||||
  try {
 | 
					    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0"
 | 
				
			||||||
    const browser = await getBrowser();
 | 
					  ];
 | 
				
			||||||
    page = await browser.newPage();
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Set user agent and viewport
 | 
					  return retryWithBackoff(async () => {
 | 
				
			||||||
    await page.setUserAgent(
 | 
					    let page = null;
 | 
				
			||||||
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
 | 
					    try {
 | 
				
			||||||
    );
 | 
					      const browser = await getBrowser();
 | 
				
			||||||
    await page.setViewport({ width: 1280, height: 720 });
 | 
					      page = await browser.newPage();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Set navigation timeout and disable images for faster loading
 | 
					      // Randomize user agent to avoid detection
 | 
				
			||||||
    page.setDefaultNavigationTimeout(45000);
 | 
					      const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)] || userAgents[0];
 | 
				
			||||||
    page.setDefaultTimeout(45000);
 | 
					      await page.setUserAgent(userAgent!);
 | 
				
			||||||
    
 | 
					      await page.setViewport({ width: 1280, height: 720 });
 | 
				
			||||||
    // Block unnecessary resources to speed up loading
 | 
					
 | 
				
			||||||
    await page.setRequestInterception(true);
 | 
					      // Set longer timeout for problematic sites
 | 
				
			||||||
    page.on('request', (req) => {
 | 
					      page.setDefaultNavigationTimeout(60000);
 | 
				
			||||||
      const resourceType = req.resourceType();
 | 
					      page.setDefaultTimeout(60000);
 | 
				
			||||||
      if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
 | 
					      
 | 
				
			||||||
        req.abort();
 | 
					      // Block unnecessary resources to speed up loading
 | 
				
			||||||
      } else {
 | 
					      await page.setRequestInterception(true);
 | 
				
			||||||
        req.continue();
 | 
					      page.on('request', (req) => {
 | 
				
			||||||
 | 
					        const resourceType = req.resourceType();
 | 
				
			||||||
 | 
					        if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
 | 
				
			||||||
 | 
					          req.abort();
 | 
				
			||||||
 | 
					        } else {
 | 
				
			||||||
 | 
					          req.continue();
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					      });
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // Add extra headers to appear more like a real browser
 | 
				
			||||||
 | 
					      await page.setExtraHTTPHeaders({
 | 
				
			||||||
 | 
					        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
 | 
				
			||||||
 | 
					        'Accept-Language': 'en-US,en;q=0.5',
 | 
				
			||||||
 | 
					        'Accept-Encoding': 'gzip, deflate, br',
 | 
				
			||||||
 | 
					        'DNT': '1',
 | 
				
			||||||
 | 
					        'Connection': 'keep-alive',
 | 
				
			||||||
 | 
					        'Upgrade-Insecure-Requests': '1'
 | 
				
			||||||
 | 
					      });
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // Navigate with multiple wait strategies
 | 
				
			||||||
 | 
					      let response;
 | 
				
			||||||
 | 
					      try {
 | 
				
			||||||
 | 
					        response = await page.goto(url, {
 | 
				
			||||||
 | 
					          waitUntil: ["domcontentloaded", "networkidle0"],
 | 
				
			||||||
 | 
					          timeout: 60000,
 | 
				
			||||||
 | 
					        });
 | 
				
			||||||
 | 
					      } catch (networkError) {
 | 
				
			||||||
 | 
					        // Fallback to more basic wait strategy
 | 
				
			||||||
 | 
					        const errorMessage = networkError instanceof Error ? networkError.message : 'Unknown error';
 | 
				
			||||||
 | 
					        console.log('Network idle failed, trying domcontentloaded only:', errorMessage);
 | 
				
			||||||
 | 
					        response = await page.goto(url, {
 | 
				
			||||||
 | 
					          waitUntil: "domcontentloaded",
 | 
				
			||||||
 | 
					          timeout: 60000,
 | 
				
			||||||
 | 
					        });
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    });
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Navigate to the page with better waiting strategy
 | 
					      if (!response) {
 | 
				
			||||||
    const response = await page.goto(url, {
 | 
					        throw new Error('No response received from server');
 | 
				
			||||||
      waitUntil: "domcontentloaded",
 | 
					      }
 | 
				
			||||||
      timeout: 45000,
 | 
					 | 
				
			||||||
    });
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (!response || !response.ok()) {
 | 
					      const status = response?.status();
 | 
				
			||||||
      throw new Error(`HTTP ${response?.status()}: Failed to load page`);
 | 
					      if (status && status >= 400) {
 | 
				
			||||||
    }
 | 
					        throw new Error(`HTTP ${status}: ${response?.statusText() || 'Unknown error'}`);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Enhanced dynamic content handling
 | 
					      // Enhanced dynamic content handling
 | 
				
			||||||
    console.log('Handling dynamic content...');
 | 
					      console.log('Handling dynamic content...');
 | 
				
			||||||
    await handleDynamicContent(page);
 | 
					      await handleDynamicContent(page);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Extract content using advanced multi-strategy approach
 | 
					    // Extract content using advanced multi-strategy approach
 | 
				
			||||||
    console.log('Extracting content using multi-strategy approach...');
 | 
					    console.log('Extracting content using multi-strategy approach...');
 | 
				
			||||||
@@ -420,13 +526,13 @@ export async function extractArticleContent(
 | 
				
			|||||||
      
 | 
					      
 | 
				
			||||||
      console.log(`Found ${candidates.length} content candidates`);
 | 
					      console.log(`Found ${candidates.length} content candidates`);
 | 
				
			||||||
      if (candidates.length > 0) {
 | 
					      if (candidates.length > 0) {
 | 
				
			||||||
        console.log(`Best candidate score: ${candidates[0].score}, selector: ${candidates[0].selector}`);
 | 
					        console.log(`Best candidate score: ${candidates[0]!.score}, selector: ${candidates[0]!.selector}`);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      // Get the best content
 | 
					      // Get the best content
 | 
				
			||||||
      let content = "";
 | 
					      let content = "";
 | 
				
			||||||
      if (candidates.length > 0) {
 | 
					      if (candidates.length > 0) {
 | 
				
			||||||
        content = candidates[0].content;
 | 
					        content = candidates[0]!.content;
 | 
				
			||||||
        
 | 
					        
 | 
				
			||||||
        // If the best candidate is still short, try combining top candidates
 | 
					        // If the best candidate is still short, try combining top candidates
 | 
				
			||||||
        if (content.length < 500 && candidates.length > 1) {
 | 
					        if (content.length < 500 && candidates.length > 1) {
 | 
				
			||||||
@@ -513,9 +619,9 @@ export async function extractArticleContent(
 | 
				
			|||||||
        extractedData.content = fallbackData;
 | 
					        extractedData.content = fallbackData;
 | 
				
			||||||
      } else {
 | 
					      } else {
 | 
				
			||||||
        return {
 | 
					        return {
 | 
				
			||||||
          title: extractedData.title,
 | 
					          title: extractedData.title || '',
 | 
				
			||||||
          content: extractedData.content || "",
 | 
					          content: extractedData.content || "",
 | 
				
			||||||
          description: extractedData.description,
 | 
					          description: extractedData.description || '',
 | 
				
			||||||
          success: false,
 | 
					          success: false,
 | 
				
			||||||
          error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
 | 
					          error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
 | 
				
			||||||
        };
 | 
					        };
 | 
				
			||||||
@@ -529,26 +635,65 @@ export async function extractArticleContent(
 | 
				
			|||||||
      content = content.substring(0, maxLength) + "...";
 | 
					      content = content.substring(0, maxLength) + "...";
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    console.log(`Successfully extracted content: ${content.length} characters`);
 | 
					      console.log(`Successfully extracted content: ${content.length} characters`);
 | 
				
			||||||
    return {
 | 
					      return {
 | 
				
			||||||
      title: extractedData.title,
 | 
					        title: extractedData.title,
 | 
				
			||||||
      content,
 | 
					        content,
 | 
				
			||||||
      description: extractedData.description,
 | 
					        description: extractedData.description,
 | 
				
			||||||
      success: true,
 | 
					        success: true,
 | 
				
			||||||
    };
 | 
					      };
 | 
				
			||||||
 | 
					    } catch (error) {
 | 
				
			||||||
 | 
					      console.error(`Content extraction attempt failed:`, error);
 | 
				
			||||||
 | 
					      throw error; // Let retry logic handle this
 | 
				
			||||||
 | 
					    } finally {
 | 
				
			||||||
 | 
					      if (page) {
 | 
				
			||||||
 | 
					        try {
 | 
				
			||||||
 | 
					          await page.close();
 | 
				
			||||||
 | 
					        } catch (closeError) {
 | 
				
			||||||
 | 
					          console.warn('Failed to close page:', closeError);
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  });
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
					 * Public entry point: extracts article content from `url`.
					 *
					 * Delegates the work to `extractWithRetry` and never rejects: any failure
					 * is logged and converted into an `ExtractedContent` with `success: false`
					 * and a human-readable `error` string.
					 *
					 * @param url - Address of the page to extract.
					 * @returns The extraction result, or an empty failed result describing the error.
					 */
					export async function extractArticleContent(
					  url: string,
					): Promise<ExtractedContent> {
					  console.log(`Starting content extraction for: ${url}`);

					  try {
					    return await extractWithRetry(url);
					  } catch (error) {
					    console.error(`Content extraction failed after all retries for ${url}:`, error);

					    // Translate the raw failure into a more specific message. Rules are checked
					    // in order and the first matching fragment wins.
					    const describeFailure = (failure: unknown): string => {
					      if (!(failure instanceof Error)) {
					        return "Unknown error occurred";
					      }
					      const rawMessage = failure.message;
					      const rules: ReadonlyArray<readonly [string, string]> = [
					        ['ERR_SOCKET_NOT_CONNECTED', "Network connection failed - server may be unreachable"],
					        ['ERR_CONNECTION_REFUSED', "Connection refused by server"],
					        ['ERR_NAME_NOT_RESOLVED', "DNS resolution failed - domain may not exist"],
					        ['ERR_TIMED_OUT', "Request timed out - server too slow"],
					        ['HTTP 4', `Client error: ${rawMessage}`],
					        ['HTTP 5', `Server error: ${rawMessage}`],
					      ];
					      const matched = rules.find(([fragment]) => rawMessage.includes(fragment));
					      return matched === undefined ? rawMessage : matched[1];
					    };

					    return {
					      title: "",
					      content: "",
					      description: "",
					      success: false,
					      error: describeFailure(error),
					    };
					  }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user