Update content extractor
This commit is contained in:
		@@ -11,6 +11,122 @@ export interface ExtractedContent {
 | 
			
		||||
// Singleton browser instance for reuse across extraction calls.
// Lazily (re)created by getBrowser() when null or disconnected, so the
// cost of launching a Chromium process is paid once, not per request.
let sharedBrowser: Browser | null = null;
// Dynamic content handling function
 | 
			
		||||
async function handleDynamicContent(page: any): Promise<void> {
 | 
			
		||||
  try {
 | 
			
		||||
    console.log('Starting dynamic content handling...');
 | 
			
		||||
    // Wait for initial content
 | 
			
		||||
    await page.waitForSelector('body', { timeout: 5000 });
 | 
			
		||||
    
 | 
			
		||||
    // Progressive loading strategy
 | 
			
		||||
    const loadingStrategies = [
 | 
			
		||||
      // Strategy 1: Wait for common loading indicators to disappear
 | 
			
		||||
      async () => {
 | 
			
		||||
        const loadingSelectors = [
 | 
			
		||||
          '.loading', '.loader', '.spinner', '.skeleton',
 | 
			
		||||
          '[class*="loading"]', '[class*="skeleton"]',
 | 
			
		||||
          '.placeholder', '.shimmer'
 | 
			
		||||
        ];
 | 
			
		||||
        
 | 
			
		||||
        for (const selector of loadingSelectors) {
 | 
			
		||||
          try {
 | 
			
		||||
            await page.waitForSelector(selector, { timeout: 2000 });
 | 
			
		||||
            await page.waitForSelector(selector, { hidden: true, timeout: 10000 });
 | 
			
		||||
            break;
 | 
			
		||||
          } catch (e) {
 | 
			
		||||
            // Continue to next selector
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      },
 | 
			
		||||
      
 | 
			
		||||
      // Strategy 2: Auto-scroll to trigger lazy loading
 | 
			
		||||
      async () => {
 | 
			
		||||
        await page.evaluate(() => {
 | 
			
		||||
          return new Promise<void>((resolve) => {
 | 
			
		||||
            let totalHeight = 0;
 | 
			
		||||
            const distance = 500;
 | 
			
		||||
            const timer = setInterval(() => {
 | 
			
		||||
              const scrollHeight = document.body.scrollHeight;
 | 
			
		||||
              window.scrollBy(0, distance);
 | 
			
		||||
              totalHeight += distance;
 | 
			
		||||
              
 | 
			
		||||
              if (totalHeight >= scrollHeight || totalHeight > 5000) {
 | 
			
		||||
                clearInterval(timer);
 | 
			
		||||
                window.scrollTo(0, 0); // Scroll back to top
 | 
			
		||||
                setTimeout(() => resolve(), 1000);
 | 
			
		||||
              }
 | 
			
		||||
            }, 200);
 | 
			
		||||
          });
 | 
			
		||||
        });
 | 
			
		||||
      },
 | 
			
		||||
      
 | 
			
		||||
      // Strategy 3: Wait for content-specific indicators
 | 
			
		||||
      async () => {
 | 
			
		||||
        const contentSelectors = [
 | 
			
		||||
          'article', '.article-content', '.post-content', '.entry-content',
 | 
			
		||||
          'main', '[role="main"]', '.main-content'
 | 
			
		||||
        ];
 | 
			
		||||
        
 | 
			
		||||
        for (const selector of contentSelectors) {
 | 
			
		||||
          try {
 | 
			
		||||
            await page.waitForSelector(selector, { timeout: 3000 });
 | 
			
		||||
            break;
 | 
			
		||||
          } catch (e) {
 | 
			
		||||
            // Continue to next selector
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      },
 | 
			
		||||
      
 | 
			
		||||
      // Strategy 4: Handle "Read More" or expansion buttons
 | 
			
		||||
      async () => {
 | 
			
		||||
        const expandButtons = [
 | 
			
		||||
          'button[class*="read-more"]', 'button[class*="expand"]',
 | 
			
		||||
          '.read-more', '.show-more', '.expand-content',
 | 
			
		||||
          'a[class*="read-more"]', 'a[class*="continue"]'
 | 
			
		||||
        ];
 | 
			
		||||
        
 | 
			
		||||
        for (const selector of expandButtons) {
 | 
			
		||||
          try {
 | 
			
		||||
            const button = await page.$(selector);
 | 
			
		||||
            if (button) {
 | 
			
		||||
              await button.click();
 | 
			
		||||
              await page.waitForTimeout(2000);
 | 
			
		||||
              break;
 | 
			
		||||
            }
 | 
			
		||||
          } catch (e) {
 | 
			
		||||
            // Continue to next button
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    ];
 | 
			
		||||
    
 | 
			
		||||
    // Execute strategies with timeouts
 | 
			
		||||
    const executeWithTimeout = async (strategy: () => Promise<void>, timeout: number) => {
 | 
			
		||||
      return Promise.race([
 | 
			
		||||
        strategy(),
 | 
			
		||||
        new Promise<void>((resolve) => setTimeout(resolve, timeout))
 | 
			
		||||
      ]);
 | 
			
		||||
    };
 | 
			
		||||
    
 | 
			
		||||
    // Execute all strategies in parallel with timeouts
 | 
			
		||||
    await Promise.allSettled([
 | 
			
		||||
      executeWithTimeout(loadingStrategies[0]!, 3000),
 | 
			
		||||
      executeWithTimeout(loadingStrategies[1]!, 8000),
 | 
			
		||||
      executeWithTimeout(loadingStrategies[2]!, 5000),
 | 
			
		||||
      executeWithTimeout(loadingStrategies[3]!, 3000)
 | 
			
		||||
    ]);
 | 
			
		||||
    
 | 
			
		||||
    // Final wait for any remaining dynamic content
 | 
			
		||||
    await page.waitForTimeout(2000);
 | 
			
		||||
    
 | 
			
		||||
  } catch (error) {
 | 
			
		||||
    console.log('Dynamic content handling failed, using basic timeout:', error);
 | 
			
		||||
    // If dynamic content handling fails, continue with basic timeout
 | 
			
		||||
    await page.waitForTimeout(3000);
 | 
			
		||||
  }
 | 
			
		||||
  console.log('Dynamic content handling completed.');
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
async function getBrowser(): Promise<Browser> {
 | 
			
		||||
  if (!sharedBrowser || !sharedBrowser.isConnected()) {
 | 
			
		||||
    sharedBrowser = await puppeteer.launch({
 | 
			
		||||
@@ -41,6 +157,7 @@ export async function closeBrowser(): Promise<void> {
 | 
			
		||||
export async function extractArticleContent(
 | 
			
		||||
  url: string,
 | 
			
		||||
): Promise<ExtractedContent> {
 | 
			
		||||
  console.log(`Starting content extraction for: ${url}`);
 | 
			
		||||
  let page = null;
 | 
			
		||||
  try {
 | 
			
		||||
    const browser = await getBrowser();
 | 
			
		||||
@@ -52,155 +169,352 @@ export async function extractArticleContent(
 | 
			
		||||
    );
 | 
			
		||||
    await page.setViewport({ width: 1280, height: 720 });
 | 
			
		||||
 | 
			
		||||
    // Set navigation timeout
 | 
			
		||||
    page.setDefaultNavigationTimeout(30000);
 | 
			
		||||
    page.setDefaultTimeout(30000);
 | 
			
		||||
    // Set navigation timeout and disable images for faster loading
 | 
			
		||||
    page.setDefaultNavigationTimeout(45000);
 | 
			
		||||
    page.setDefaultTimeout(45000);
 | 
			
		||||
    
 | 
			
		||||
    // Navigate to the page
 | 
			
		||||
    // Block unnecessary resources to speed up loading
 | 
			
		||||
    await page.setRequestInterception(true);
 | 
			
		||||
    page.on('request', (req) => {
 | 
			
		||||
      const resourceType = req.resourceType();
 | 
			
		||||
      if (resourceType === 'image' || resourceType === 'media' || resourceType === 'font') {
 | 
			
		||||
        req.abort();
 | 
			
		||||
      } else {
 | 
			
		||||
        req.continue();
 | 
			
		||||
      }
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    // Navigate to the page with better waiting strategy
 | 
			
		||||
    const response = await page.goto(url, {
 | 
			
		||||
      waitUntil: "networkidle2",
 | 
			
		||||
      timeout: 30000,
 | 
			
		||||
      waitUntil: "domcontentloaded",
 | 
			
		||||
      timeout: 45000,
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    if (!response || !response.ok()) {
 | 
			
		||||
      throw new Error(`HTTP ${response?.status()}: Failed to load page`);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Wait for potential dynamic content
 | 
			
		||||
    await new Promise((resolve) => setTimeout(resolve, 2000));
 | 
			
		||||
    // Enhanced dynamic content handling
 | 
			
		||||
    console.log('Handling dynamic content...');
 | 
			
		||||
    await handleDynamicContent(page);
 | 
			
		||||
 | 
			
		||||
    // Extract content using page.evaluate
 | 
			
		||||
    // Extract content using advanced multi-strategy approach
 | 
			
		||||
    console.log('Extracting content using multi-strategy approach...');
 | 
			
		||||
    const extractedData = await page.evaluate(() => {
 | 
			
		||||
      // Remove unwanted elements
 | 
			
		||||
      interface ContentCandidate {
 | 
			
		||||
        element: Element;
 | 
			
		||||
        score: number;
 | 
			
		||||
        content: string;
 | 
			
		||||
        selector: string;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // Remove unwanted elements first
 | 
			
		||||
      const unwantedSelectors = [
 | 
			
		||||
        "script",
 | 
			
		||||
        "style",
 | 
			
		||||
        "nav",
 | 
			
		||||
        "header",
 | 
			
		||||
        "footer",
 | 
			
		||||
        "aside",
 | 
			
		||||
        ".advertisement",
 | 
			
		||||
        ".ads",
 | 
			
		||||
        ".sidebar",
 | 
			
		||||
        ".menu",
 | 
			
		||||
        ".navigation",
 | 
			
		||||
        ".social-share",
 | 
			
		||||
        ".comments",
 | 
			
		||||
        ".cookie-banner",
 | 
			
		||||
        ".popup",
 | 
			
		||||
        ".modal",
 | 
			
		||||
        "script", "style", "noscript", "iframe", "embed", "object",
 | 
			
		||||
        "nav", "header", "footer", "aside", "form",
 | 
			
		||||
        ".advertisement", ".ads", ".ad", ".adsbygoogle", "[class*='ad-']", "[id*='ad-']",
 | 
			
		||||
        ".sidebar", ".menu", ".navigation", ".nav", ".breadcrumb",
 | 
			
		||||
        ".social-share", ".share", ".social", ".sns",
 | 
			
		||||
        ".comments", ".comment", ".disqus",
 | 
			
		||||
        ".cookie-banner", ".cookie", ".gdpr",
 | 
			
		||||
        ".popup", ".modal", ".overlay", ".lightbox",
 | 
			
		||||
        ".related", ".recommended", ".more-stories",
 | 
			
		||||
        ".tags", ".categories", ".metadata",
 | 
			
		||||
        ".author-bio", ".author-info",
 | 
			
		||||
        ".newsletter", ".subscribe", ".signup",
 | 
			
		||||
        "[role='complementary']", "[role='banner']", "[role='contentinfo']",
 | 
			
		||||
        "[aria-label*='advertisement']", "[aria-label*='sidebar']"
 | 
			
		||||
      ];
 | 
			
		||||
 | 
			
		||||
      unwantedSelectors.forEach((selector) => {
 | 
			
		||||
        const elements = document.querySelectorAll(selector);
 | 
			
		||||
        elements.forEach((el) => el.remove());
 | 
			
		||||
        try {
 | 
			
		||||
          const elements = document.querySelectorAll(selector);
 | 
			
		||||
          elements.forEach((el) => el.remove());
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
          // Ignore invalid selectors
 | 
			
		||||
        }
 | 
			
		||||
      });
 | 
			
		||||
 | 
			
		||||
      let content = "";
 | 
			
		||||
      let title = "";
 | 
			
		||||
      let description = "";
 | 
			
		||||
 | 
			
		||||
      // Extract title
 | 
			
		||||
      const titleElement = document.querySelector("title");
 | 
			
		||||
      const h1Element = document.querySelector("h1");
 | 
			
		||||
      const ogTitleMeta = document.querySelector('meta[property="og:title"]');
 | 
			
		||||
 | 
			
		||||
      title =
 | 
			
		||||
        titleElement?.textContent?.trim() ||
 | 
			
		||||
        h1Element?.textContent?.trim() ||
 | 
			
		||||
        ogTitleMeta?.getAttribute("content") ||
 | 
			
		||||
        "";
 | 
			
		||||
 | 
			
		||||
      // Extract description
 | 
			
		||||
      const descriptionMeta = document.querySelector(
 | 
			
		||||
        'meta[name="description"]',
 | 
			
		||||
      );
 | 
			
		||||
      const ogDescriptionMeta = document.querySelector(
 | 
			
		||||
        'meta[property="og:description"]',
 | 
			
		||||
      );
 | 
			
		||||
 | 
			
		||||
      description =
 | 
			
		||||
        descriptionMeta?.getAttribute("content") ||
 | 
			
		||||
        ogDescriptionMeta?.getAttribute("content") ||
 | 
			
		||||
        "";
 | 
			
		||||
 | 
			
		||||
      // Try multiple content extraction strategies
 | 
			
		||||
      const contentSelectors = [
 | 
			
		||||
        // Common article selectors
 | 
			
		||||
        "article",
 | 
			
		||||
        '[role="main"]',
 | 
			
		||||
        ".article-content",
 | 
			
		||||
        ".post-content",
 | 
			
		||||
        ".entry-content",
 | 
			
		||||
        ".content",
 | 
			
		||||
        ".main-content",
 | 
			
		||||
        ".article-body",
 | 
			
		||||
        ".post-body",
 | 
			
		||||
        ".story-body",
 | 
			
		||||
        ".news-content",
 | 
			
		||||
 | 
			
		||||
        // Japanese news site specific selectors
 | 
			
		||||
        ".article",
 | 
			
		||||
        ".news-article",
 | 
			
		||||
        ".post",
 | 
			
		||||
        ".entry",
 | 
			
		||||
        "#content",
 | 
			
		||||
        "#main",
 | 
			
		||||
        ".main",
 | 
			
		||||
 | 
			
		||||
        // Fallback to common containers
 | 
			
		||||
        ".container",
 | 
			
		||||
        "#container",
 | 
			
		||||
        "main",
 | 
			
		||||
        "body",
 | 
			
		||||
      let title = "";
 | 
			
		||||
      const titleSources = [
 | 
			
		||||
        () => document.querySelector('meta[property="og:title"]')?.getAttribute('content'),
 | 
			
		||||
        () => document.querySelector('meta[name="twitter:title"]')?.getAttribute('content'),
 | 
			
		||||
        () => document.querySelector('h1')?.textContent?.trim(),
 | 
			
		||||
        () => document.querySelector('.article-title, .post-title, .entry-title')?.textContent?.trim(),
 | 
			
		||||
        () => document.querySelector('title')?.textContent?.trim(),
 | 
			
		||||
        () => document.querySelector('[itemprop="headline"]')?.textContent?.trim()
 | 
			
		||||
      ];
 | 
			
		||||
 | 
			
		||||
      for (const selector of contentSelectors) {
 | 
			
		||||
        const element = document.querySelector(selector);
 | 
			
		||||
        if (element) {
 | 
			
		||||
          // Get text content and clean it up
 | 
			
		||||
          let extractedText = element.textContent?.trim() || "";
 | 
			
		||||
 | 
			
		||||
          // Remove extra whitespace and normalize
 | 
			
		||||
          extractedText = extractedText
 | 
			
		||||
            .replace(/\s+/g, " ")
 | 
			
		||||
            .replace(/\n\s*\n/g, "\n")
 | 
			
		||||
            .trim();
 | 
			
		||||
 | 
			
		||||
          // Only use if we found substantial content
 | 
			
		||||
          if (extractedText.length > 200) {
 | 
			
		||||
            content = extractedText;
 | 
			
		||||
      for (const source of titleSources) {
 | 
			
		||||
        try {
 | 
			
		||||
          const result = source();
 | 
			
		||||
          if (result && result.length > 0) {
 | 
			
		||||
            title = result;
 | 
			
		||||
            break;
 | 
			
		||||
          }
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
          continue;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // Extract description
 | 
			
		||||
      let description = "";
 | 
			
		||||
      const descriptionSources = [
 | 
			
		||||
        () => document.querySelector('meta[property="og:description"]')?.getAttribute('content'),
 | 
			
		||||
        () => document.querySelector('meta[name="description"]')?.getAttribute('content'),
 | 
			
		||||
        () => document.querySelector('meta[name="twitter:description"]')?.getAttribute('content'),
 | 
			
		||||
        () => document.querySelector('[itemprop="description"]')?.textContent?.trim()
 | 
			
		||||
      ];
 | 
			
		||||
 | 
			
		||||
      for (const source of descriptionSources) {
 | 
			
		||||
        try {
 | 
			
		||||
          const result = source();
 | 
			
		||||
          if (result && result.length > 0) {
 | 
			
		||||
            description = result;
 | 
			
		||||
            break;
 | 
			
		||||
          }
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
          continue;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      // Comprehensive content selectors with priorities
 | 
			
		||||
      const contentSelectors = [
 | 
			
		||||
        // Schema.org and structured data
 | 
			
		||||
        '[itemtype*="Article"] [itemprop="articleBody"]',
 | 
			
		||||
        '[itemtype*="NewsArticle"] [itemprop="articleBody"]',
 | 
			
		||||
        '[itemtype*="BlogPosting"] [itemprop="articleBody"]',
 | 
			
		||||
        
 | 
			
		||||
        // High-priority semantic selectors
 | 
			
		||||
        'article[role="main"]',
 | 
			
		||||
        'main article',
 | 
			
		||||
        '[role="main"] article',
 | 
			
		||||
        'article',
 | 
			
		||||
        
 | 
			
		||||
        // Common CMS and platform selectors
 | 
			
		||||
        '.post-content', '.entry-content', '.article-content', '.content-area',
 | 
			
		||||
        '.article-body', '.post-body', '.entry-body', '.story-body',
 | 
			
		||||
        '.main-content', '.primary-content', '.page-content',
 | 
			
		||||
        '.news-content', '.blog-content', '.editorial-content',
 | 
			
		||||
        
 | 
			
		||||
        // WordPress specific
 | 
			
		||||
        '.wp-content', '.entry', '.post',
 | 
			
		||||
        
 | 
			
		||||
        // Medium, Substack, Ghost
 | 
			
		||||
        '.section-content', '.postArticle-content', '.post-full-content',
 | 
			
		||||
        '.markup', '.section--body', '.section-divider + .section-content',
 | 
			
		||||
        
 | 
			
		||||
        // Japanese sites specific
 | 
			
		||||
        '.honbun', '.main_text', '.article_body', '.news_body',
 | 
			
		||||
        '.entry_text', '.blog_text', '.content_text',
 | 
			
		||||
        '.kiji', '.news', '.article',
 | 
			
		||||
        
 | 
			
		||||
        // Generic semantic HTML5
 | 
			
		||||
        'main', '[role="main"]',
 | 
			
		||||
        
 | 
			
		||||
        // ID-based selectors
 | 
			
		||||
        '#content', '#main', '#article', '#post', '#entry',
 | 
			
		||||
        '#main-content', '#primary', '#content-area',
 | 
			
		||||
        
 | 
			
		||||
        // Class-based common patterns
 | 
			
		||||
        '.content', '.main', '.wrapper', '.container',
 | 
			
		||||
        
 | 
			
		||||
        // Fallbacks
 | 
			
		||||
        'body'
 | 
			
		||||
      ];
 | 
			
		||||
 | 
			
		||||
      // Function to calculate content quality score
 | 
			
		||||
      function calculateContentScore(element: Element): number {
 | 
			
		||||
        if (!element) return 0;
 | 
			
		||||
        
 | 
			
		||||
        const text = element.textContent || '';
 | 
			
		||||
        if (text.length < 100) return 0;
 | 
			
		||||
        
 | 
			
		||||
        let score = 0;
 | 
			
		||||
        
 | 
			
		||||
        // Base score from text length (diminishing returns)
 | 
			
		||||
        score += Math.min(text.length / 100, 50);
 | 
			
		||||
        
 | 
			
		||||
        // Paragraph density
 | 
			
		||||
        const paragraphs = element.querySelectorAll('p');
 | 
			
		||||
        const avgParagraphLength = paragraphs.length > 0 ? 
 | 
			
		||||
          Array.from(paragraphs).reduce((sum, p) => sum + (p.textContent?.length || 0), 0) / paragraphs.length : 0;
 | 
			
		||||
        
 | 
			
		||||
        if (avgParagraphLength > 100) score += 20;
 | 
			
		||||
        if (paragraphs.length > 3) score += 10;
 | 
			
		||||
        
 | 
			
		||||
        // Link density penalty (articles shouldn't be mostly links)
 | 
			
		||||
        const links = element.querySelectorAll('a');
 | 
			
		||||
        const linkText = Array.from(links).reduce((sum, link) => sum + (link.textContent?.length || 0), 0);
 | 
			
		||||
        const linkDensity = text.length > 0 ? linkText / text.length : 0;
 | 
			
		||||
        if (linkDensity < 0.2) score += 15;
 | 
			
		||||
        else if (linkDensity < 0.4) score += 5;
 | 
			
		||||
        else score -= 10;
 | 
			
		||||
        
 | 
			
		||||
        // Bonus for article-like structure
 | 
			
		||||
        if (element.tagName === 'ARTICLE') score += 25;
 | 
			
		||||
        if (element.getAttribute('role') === 'main') score += 20;
 | 
			
		||||
        if (element.querySelector('h1, h2, h3')) score += 10;
 | 
			
		||||
        
 | 
			
		||||
        // Bonus for semantic elements
 | 
			
		||||
        const semanticElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, blockquote, ul, ol');
 | 
			
		||||
        if (semanticElements.length > 5) score += 15;
 | 
			
		||||
        
 | 
			
		||||
        // Penalty for too many images without text
 | 
			
		||||
        const images = element.querySelectorAll('img');
 | 
			
		||||
        if (images.length > text.length / 500) score -= 5;
 | 
			
		||||
        
 | 
			
		||||
        // Penalty for navigation-like content
 | 
			
		||||
        const navWords = ['メニュー', 'ナビ', 'カテゴリ', 'タグ', 'menu', 'navigation', 'nav', 'sidebar'];
 | 
			
		||||
        const className = element.className.toLowerCase();
 | 
			
		||||
        const id = element.id.toLowerCase();
 | 
			
		||||
        if (navWords.some(word => className.includes(word) || id.includes(word))) {
 | 
			
		||||
          score -= 20;
 | 
			
		||||
        }
 | 
			
		||||
        
 | 
			
		||||
        return Math.max(score, 0);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Function to clean and normalize text
 | 
			
		||||
      function cleanText(text: string): string {
 | 
			
		||||
        return text
 | 
			
		||||
          .replace(/\s+/g, ' ')  // Normalize whitespace
 | 
			
		||||
          .replace(/\n\s*\n\s*\n/g, '\n\n')  // Reduce excessive line breaks
 | 
			
		||||
          .replace(/^\s+|\s+$/g, '')  // Trim
 | 
			
		||||
          .replace(/[\u200B-\u200D\uFEFF]/g, '')  // Remove zero-width characters
 | 
			
		||||
          .trim();
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Collect and score all content candidates
 | 
			
		||||
      const candidates: ContentCandidate[] = [];
 | 
			
		||||
      
 | 
			
		||||
      for (const selector of contentSelectors) {
 | 
			
		||||
        try {
 | 
			
		||||
          const elements = document.querySelectorAll(selector);
 | 
			
		||||
          elements.forEach((element, index) => {
 | 
			
		||||
            const text = element.textContent || '';
 | 
			
		||||
            if (text.length > 200) {  // Minimum content threshold
 | 
			
		||||
              const score = calculateContentScore(element);
 | 
			
		||||
              candidates.push({
 | 
			
		||||
                element,
 | 
			
		||||
                score,
 | 
			
		||||
                content: cleanText(text),
 | 
			
		||||
                selector: `${selector}[${index}]`
 | 
			
		||||
              });
 | 
			
		||||
            }
 | 
			
		||||
          });
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
          // Skip invalid selectors
 | 
			
		||||
          continue;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Sort candidates by score (highest first)
 | 
			
		||||
      candidates.sort((a, b) => b.score - a.score);
 | 
			
		||||
      
 | 
			
		||||
      console.log(`Found ${candidates.length} content candidates`);
 | 
			
		||||
      if (candidates.length > 0) {
 | 
			
		||||
        console.log(`Best candidate score: ${candidates[0].score}, selector: ${candidates[0].selector}`);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Get the best content
 | 
			
		||||
      let content = "";
 | 
			
		||||
      if (candidates.length > 0) {
 | 
			
		||||
        content = candidates[0].content;
 | 
			
		||||
        
 | 
			
		||||
        // If the best candidate is still short, try combining top candidates
 | 
			
		||||
        if (content.length < 500 && candidates.length > 1) {
 | 
			
		||||
          const topCandidates = candidates.slice(0, 3).filter(c => c.score > 10);
 | 
			
		||||
          const combinedContent = topCandidates.map(c => c.content).join('\n\n');
 | 
			
		||||
          if (combinedContent.length > content.length) {
 | 
			
		||||
            content = cleanText(combinedContent);
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // If still no content, try paragraph extraction
 | 
			
		||||
      if (!content) {
 | 
			
		||||
        const paragraphs = Array.from(document.querySelectorAll("p"))
 | 
			
		||||
          .map((p) => p.textContent?.trim() || "")
 | 
			
		||||
          .filter((p) => p.length > 50); // Filter out short paragraphs
 | 
			
		||||
        content = paragraphs.join("\n\n");
 | 
			
		||||
      // Fallback strategies if still no good content
 | 
			
		||||
      if (!content || content.length < 200) {
 | 
			
		||||
        // Try paragraph aggregation
 | 
			
		||||
        const paragraphs = Array.from(document.querySelectorAll('p'))
 | 
			
		||||
          .map(p => p.textContent?.trim() || '')
 | 
			
		||||
          .filter(p => p.length > 50)
 | 
			
		||||
          .filter(p => {
 | 
			
		||||
            // Filter out likely navigation/boilerplate paragraphs
 | 
			
		||||
            const lowerP = p.toLowerCase();
 | 
			
		||||
            return !lowerP.includes('cookie') && 
 | 
			
		||||
                   !lowerP.includes('privacy') && 
 | 
			
		||||
                   !lowerP.includes('terms of service') &&
 | 
			
		||||
                   !lowerP.includes('subscribe') &&
 | 
			
		||||
                   !lowerP.includes('newsletter');
 | 
			
		||||
          });
 | 
			
		||||
          
 | 
			
		||||
        if (paragraphs.length > 0) {
 | 
			
		||||
          content = cleanText(paragraphs.join('\n\n'));
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      // Final fallback: use body text
 | 
			
		||||
      if (!content || content.length < 100) {
 | 
			
		||||
        const bodyText = document.body?.textContent || "";
 | 
			
		||||
        content = bodyText.replace(/\s+/g, " ").trim();
 | 
			
		||||
      // Final fallback: structured data
 | 
			
		||||
      if (!content || content.length < 200) {
 | 
			
		||||
        try {
 | 
			
		||||
          const jsonLd = document.querySelector('script[type="application/ld+json"]');
 | 
			
		||||
          if (jsonLd) {
 | 
			
		||||
            const data = JSON.parse(jsonLd.textContent || '{}');
 | 
			
		||||
            if (data.articleBody) {
 | 
			
		||||
              content = cleanText(data.articleBody);
 | 
			
		||||
            } else if (data.text) {
 | 
			
		||||
              content = cleanText(data.text);
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
        } catch (e) {
 | 
			
		||||
          // Ignore JSON parsing errors
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      console.log(`Final content length: ${content.length} characters`);
 | 
			
		||||
      return { title, content, description };
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    // Validate extracted content
 | 
			
		||||
    if (!extractedData.content || extractedData.content.length < 50) {
 | 
			
		||||
      return {
 | 
			
		||||
        title: extractedData.title,
 | 
			
		||||
        content: "",
 | 
			
		||||
        description: extractedData.description,
 | 
			
		||||
        success: false,
 | 
			
		||||
        error: "Insufficient content extracted",
 | 
			
		||||
      };
 | 
			
		||||
    // Validate extracted content with more lenient threshold
 | 
			
		||||
    if (!extractedData.content || extractedData.content.length < 100) {
 | 
			
		||||
      // Try one more extraction attempt with relaxed criteria
 | 
			
		||||
      const fallbackData = await page.evaluate(() => {
 | 
			
		||||
        // Last resort: extract all text from body, excluding common noise
 | 
			
		||||
        const body = document.body;
 | 
			
		||||
        if (body) {
 | 
			
		||||
          // Clone body to avoid modifying original
 | 
			
		||||
          const bodyClone = body.cloneNode(true) as Element;
 | 
			
		||||
          
 | 
			
		||||
          // Remove noise elements from clone
 | 
			
		||||
          const noiseSelectors = [
 | 
			
		||||
            'script', 'style', 'nav', 'header', 'footer', 'aside',
 | 
			
		||||
            '.ad', '.ads', '.advertisement', '[class*="ad-"]',
 | 
			
		||||
            '.menu', '.navigation', '.sidebar', '.social',
 | 
			
		||||
            '.cookie', '.popup', '.modal'
 | 
			
		||||
          ];
 | 
			
		||||
          
 | 
			
		||||
          noiseSelectors.forEach(selector => {
 | 
			
		||||
            const elements = bodyClone.querySelectorAll(selector);
 | 
			
		||||
            elements.forEach(el => el.remove());
 | 
			
		||||
          });
 | 
			
		||||
          
 | 
			
		||||
          const text = bodyClone.textContent || '';
 | 
			
		||||
          return text.replace(/\s+/g, ' ').trim();
 | 
			
		||||
        }
 | 
			
		||||
        return '';
 | 
			
		||||
      });
 | 
			
		||||
      
 | 
			
		||||
      if (fallbackData && fallbackData.length > 200) {
 | 
			
		||||
        extractedData.content = fallbackData;
 | 
			
		||||
      } else {
 | 
			
		||||
        return {
 | 
			
		||||
          title: extractedData.title,
 | 
			
		||||
          content: extractedData.content || "",
 | 
			
		||||
          description: extractedData.description,
 | 
			
		||||
          success: false,
 | 
			
		||||
          error: `Insufficient content extracted (${extractedData.content?.length || 0} characters)`,
 | 
			
		||||
        };
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Limit content length to avoid token limits
 | 
			
		||||
@@ -210,6 +524,7 @@ export async function extractArticleContent(
 | 
			
		||||
      content = content.substring(0, maxLength) + "...";
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    console.log(`Successfully extracted content: ${content.length} characters`);
 | 
			
		||||
    return {
 | 
			
		||||
      title: extractedData.title,
 | 
			
		||||
      content,
 | 
			
		||||
@@ -217,6 +532,7 @@ export async function extractArticleContent(
 | 
			
		||||
      success: true,
 | 
			
		||||
    };
 | 
			
		||||
  } catch (error) {
 | 
			
		||||
    console.error(`Content extraction failed for ${url}:`, error);
 | 
			
		||||
    return {
 | 
			
		||||
      title: "",
 | 
			
		||||
      content: "",
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user