265 lines
		
	
	
		
			6.9 KiB
		
	
	
	
		
			TypeScript
		
	
	
	
	
	
			
		
		
	
	
			265 lines
		
	
	
		
			6.9 KiB
		
	
	
	
		
			TypeScript
		
	
	
	
	
	
import puppeteer, { type Browser, type Page } from "puppeteer";
 | 
						|
 | 
						|
/**
 * Outcome of scraping a single article page.
 *
 * The same shape is used for both successful and failed extractions;
 * check {@link ExtractedContent.success} before relying on `content`.
 */
export interface ExtractedContent {
  /** Page title as found in the document, if any. */
  title?: string;
  /** Extracted article text; an empty string when extraction failed. */
  content: string;
  /** Page description taken from the document's meta tags, if any. */
  description?: string;
  /** `true` when usable article text was extracted. */
  success: boolean;
  /** Human-readable failure reason; set only when `success` is `false`. */
  error?: string;
}
 | 
						|
 | 
						|
// Singleton browser instance, reused across extractions.
let sharedBrowser: Browser | null = null;

// Launch currently in flight, if any. Concurrent callers of getBrowser() share
// this promise so that only one Chromium process is started at a time.
let pendingBrowserLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared browser, launching a new one when none exists or the
 * previous one has disconnected.
 *
 * Callers that arrive while a launch is already in progress receive the same
 * pending launch instead of starting a second browser.
 *
 * @returns A connected browser instance.
 * @throws Whatever `puppeteer.launch` rejects with; the next call retries.
 */
async function getBrowser(): Promise<Browser> {
  if (sharedBrowser?.isConnected()) {
    return sharedBrowser;
  }

  if (pendingBrowserLaunch) {
    return pendingBrowserLaunch;
  }

  // NOTE(review): "--no-sandbox", "--disable-setuid-sandbox" and
  // "--disable-web-security" weaken Chromium's isolation while loading
  // arbitrary remote pages. They are kept to preserve existing behavior;
  // confirm they are actually required by the deployment environment.
  const launch = puppeteer.launch({
    headless: true,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--disable-accelerated-2d-canvas",
      "--no-first-run",
      "--no-zygote",
      "--disable-gpu",
      "--disable-web-security",
      "--disable-features=VizDisplayCompositor",
    ],
  });
  pendingBrowserLaunch = launch;

  try {
    const browser = await launch;
    sharedBrowser = browser;
    return browser;
  } finally {
    // Clear the marker on success and on failure so a failed launch can be
    // retried; the identity check avoids clobbering a newer launch.
    if (pendingBrowserLaunch === launch) {
      pendingBrowserLaunch = null;
    }
  }
}
 | 
						|
 | 
						|
/**
 * Closes the shared browser, if there is one, and forgets the reference.
 *
 * The reference is cleared before the asynchronous close starts, so code
 * that checks `sharedBrowser` while shutdown is in progress does not pick up
 * a browser that is about to go away. It also stays cleared if `close()`
 * rejects or the browser had already disconnected.
 *
 * @throws Whatever `Browser.close` rejects with.
 */
export async function closeBrowser(): Promise<void> {
  const browser = sharedBrowser;
  sharedBrowser = null;

  if (browser?.isConnected()) {
    await browser.close();
  }
}
 | 
						|
 | 
						|
/**
 * Loads `url` in a headless browser page and extracts the article's title,
 * description and main text.
 *
 * Failures (invalid or non-HTTP URL, navigation error, non-OK response, too
 * little text) are reported through the returned object with
 * `success: false` and an `error` message rather than by rejecting.
 *
 * @param url - Absolute `http:` or `https:` URL of the article page.
 * @returns The extracted data; `content` is capped at 50,000 characters plus
 *   a trailing `"..."` when truncated.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  const NAVIGATION_TIMEOUT_MS = 30000;
  const SETTLE_DELAY_MS = 2000;
  const MIN_CONTENT_LENGTH = 50;
  const MAX_CONTENT_LENGTH = 50000;

  const failure = (error: string): ExtractedContent => ({
    title: "",
    content: "",
    description: "",
    success: false,
    error,
  });

  // Only fetch web pages: the URL may come from untrusted feeds, and the
  // browser must not be pointed at file:, data:, javascript: etc.
  let protocol: string;
  try {
    protocol = new URL(url).protocol;
  } catch {
    return failure(`Invalid URL: ${url}`);
  }
  if (protocol !== "http:" && protocol !== "https:") {
    return failure(`Unsupported URL protocol: ${protocol}`);
  }

  let page: Page | null = null;
  try {
    const browser = await getBrowser();
    page = await browser.newPage();

    // Present as a regular desktop browser.
    await page.setUserAgent(
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    );
    await page.setViewport({ width: 1280, height: 720 });

    page.setDefaultNavigationTimeout(NAVIGATION_TIMEOUT_MS);
    page.setDefaultTimeout(NAVIGATION_TIMEOUT_MS);

    const response = await page.goto(url, {
      waitUntil: "networkidle2",
      timeout: NAVIGATION_TIMEOUT_MS,
    });

    if (!response) {
      throw new Error("No response received: Failed to load page");
    }
    if (!response.ok()) {
      throw new Error(`HTTP ${response.status()}: Failed to load page`);
    }

    // Give late-running scripts a moment to render dynamic content.
    await new Promise<void>((resolve) => setTimeout(resolve, SETTLE_DELAY_MS));

    // This callback is serialized and executed inside the page, so it must
    // not reference anything from the enclosing Node.js scope.
    const extractedData = await page.evaluate(() => {
      const collapseWhitespace = (text: string): string =>
        text.replace(/\s+/g, " ").trim();

      // Strip page chrome and other non-article elements before reading text.
      const unwantedSelectors = [
        "script",
        "style",
        "nav",
        "header",
        "footer",
        "aside",
        ".advertisement",
        ".ads",
        ".sidebar",
        ".menu",
        ".navigation",
        ".social-share",
        ".comments",
        ".cookie-banner",
        ".popup",
        ".modal",
      ];
      document
        .querySelectorAll(unwantedSelectors.join(","))
        .forEach((el) => el.remove());

      // Title: <title>, then the first remaining <h1>, then og:title.
      // `||` (not `??`) is intentional so empty strings fall through.
      const title =
        document.querySelector("title")?.textContent?.trim() ||
        document.querySelector("h1")?.textContent?.trim() ||
        document
          .querySelector('meta[property="og:title"]')
          ?.getAttribute("content") ||
        "";

      // Description: standard meta description, then og:description.
      const description =
        document
          .querySelector('meta[name="description"]')
          ?.getAttribute("content") ||
        document
          .querySelector('meta[property="og:description"]')
          ?.getAttribute("content") ||
        "";

      // Candidate containers, most specific first; the first one holding
      // more than 200 characters of text wins.
      const contentSelectors = [
        // Common article selectors
        "article",
        '[role="main"]',
        ".article-content",
        ".post-content",
        ".entry-content",
        ".content",
        ".main-content",
        ".article-body",
        ".post-body",
        ".story-body",
        ".news-content",

        // Japanese news site specific selectors
        ".article",
        ".news-article",
        ".post",
        ".entry",
        "#content",
        "#main",
        ".main",

        // Fallback to common containers
        ".container",
        "#container",
        "main",
        "body",
      ];

      let content = "";
      for (const selector of contentSelectors) {
        const text = collapseWhitespace(
          document.querySelector(selector)?.textContent ?? "",
        );
        if (text.length > 200) {
          content = text;
          break;
        }
      }

      // No substantial container: stitch together the longer paragraphs.
      if (!content) {
        content = Array.from(document.querySelectorAll("p"))
          .map((p) => p.textContent?.trim() ?? "")
          .filter((text) => text.length > 50)
          .join("\n\n");
      }

      // Last resort: whatever text the body still has.
      if (content.length < 100) {
        content = collapseWhitespace(document.body?.textContent ?? "");
      }

      return { title, content, description };
    });

    if (
      !extractedData.content ||
      extractedData.content.length < MIN_CONTENT_LENGTH
    ) {
      return {
        title: extractedData.title,
        content: "",
        description: extractedData.description,
        success: false,
        error: "Insufficient content extracted",
      };
    }

    // Cap the length so downstream consumers are not overwhelmed.
    let content = extractedData.content;
    if (content.length > MAX_CONTENT_LENGTH) {
      let end = MAX_CONTENT_LENGTH;
      // Do not cut a surrogate pair in half.
      const lastUnit = content.charCodeAt(end - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
      }
      content = content.substring(0, end) + "...";
    }

    return {
      title: extractedData.title,
      content,
      description: extractedData.description,
      success: true,
    };
  } catch (error) {
    return failure(
      error instanceof Error ? error.message : "Unknown error occurred",
    );
  } finally {
    if (page) {
      try {
        await page.close();
      } catch {
        // Best effort: a failure to close (e.g. the browser has already
        // disconnected) must not replace the result computed above.
      }
    }
  }
}
 | 
						|
 | 
						|
/**
 * Fills in an article's content by scraping its page when the text already
 * on hand is short.
 *
 * The existing text (content, or description when content is empty) is kept
 * as-is when it is longer than 500 characters. Otherwise the page at
 * `originalLink` is scraped; on success the scraped content is returned with
 * the scraped description (falling back to the original one), and on failure
 * the original values are returned unchanged.
 *
 * @param _originalTitle - Unused.
 * @param originalLink - URL of the article page.
 * @param originalContent - Content already available, if any.
 * @param originalDescription - Description already available, if any.
 * @returns The content and description to use for the article.
 */
export async function enhanceArticleContent(
  _originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  const unchanged = {
    content: originalContent,
    description: originalDescription,
  };

  // Already substantial: no need to hit the network.
  const textOnHand = originalContent || originalDescription || "";
  if (textOnHand.length > 500) {
    return unchanged;
  }

  const { success, content, description } =
    await extractArticleContent(originalLink);

  // Scraping failed or produced nothing: keep what we had.
  if (!success || !content) {
    return unchanged;
  }

  return {
    content,
    description: description || originalDescription,
  };
}
 |