Replace cheerio with puppeteer

2025-06-08 21:34:11 +09:00
parent 080d47ab01
commit 4aa1b5c56a
6 changed files with 382 additions and 98 deletions
--- a/services/content-extractor.ts
+++ b/services/content-extractor.ts
@@ -1,4 +1,4 @@
-import * as cheerio from "cheerio";
+import puppeteer, { type Browser } from "puppeteer";

 export interface ExtractedContent {
  title?: string;
@@ -8,126 +8,194 @@ export interface ExtractedContent {
  error?: string;
 }

+// Singleton browser instance for reuse
+let sharedBrowser: Browser | null = null;
+
+async function getBrowser(): Promise<Browser> {
+  if (!sharedBrowser || !sharedBrowser.isConnected()) {
+    sharedBrowser = await puppeteer.launch({
+      headless: true,
+      args: [
+        "--no-sandbox",
+        "--disable-setuid-sandbox",
+        "--disable-dev-shm-usage",
+        "--disable-accelerated-2d-canvas",
+        "--no-first-run",
+        "--no-zygote",
+        "--disable-gpu",
+        "--disable-web-security",
+        "--disable-features=VizDisplayCompositor",
+      ],
+    });
+  }
+  return sharedBrowser;
+}
+
+export async function closeBrowser(): Promise<void> {
+  if (sharedBrowser && sharedBrowser.isConnected()) {
+    await sharedBrowser.close();
+    sharedBrowser = null;
+  }
+}
+
 export async function extractArticleContent(
  url: string,
 ): Promise<ExtractedContent> {
+  let page = null;
  try {
-    // Fetch the HTML content
-    const response = await fetch(url, {
-      headers: {
-        "User-Agent":
-          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-        Accept:
-          "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-        "Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
-        "Accept-Encoding": "gzip, deflate",
-        Connection: "keep-alive",
-        "Upgrade-Insecure-Requests": "1",
-      },
-      signal: AbortSignal.timeout(30000), // 30 second timeout
+    const browser = await getBrowser();
+    page = await browser.newPage();
+
+    // Set user agent and viewport
+    await page.setUserAgent(
+      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    );
+    await page.setViewport({ width: 1280, height: 720 });
+
+    // Set navigation timeout
+    page.setDefaultNavigationTimeout(30000);
+    page.setDefaultTimeout(30000);
+
+    // Navigate to the page
+    const response = await page.goto(url, {
+      waitUntil: "networkidle2",
+      timeout: 30000,
    });

-    if (!response.ok) {
-      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+    if (!response || !response.ok()) {
+      throw new Error(`HTTP ${response?.status()}: Failed to load page`);
    }

-    const html = await response.text();
-    const $ = cheerio.load(html);
+    // Wait for potential dynamic content
+    await new Promise(resolve => setTimeout(resolve, 2000));

-    // Remove unwanted elements
-    $(
-      "script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments",
-    ).remove();
+    // Extract content using page.evaluate
+    const extractedData = await page.evaluate(() => {
+      // Remove unwanted elements
+      const unwantedSelectors = [
+        "script",
+        "style",
+        "nav",
+        "header",
+        "footer",
+        "aside",
+        ".advertisement",
+        ".ads",
+        ".sidebar",
+        ".menu",
+        ".navigation",
+        ".social-share",
+        ".comments",
+        ".cookie-banner",
+        ".popup",
+        ".modal",
+      ];

-    let content = "";
-    let title = "";
-    let description = "";
+      unwantedSelectors.forEach((selector) => {
+        const elements = document.querySelectorAll(selector);
+        elements.forEach((el) => el.remove());
+      });

-    // Extract title
-    title =
-      $("title").text().trim() ||
-      $("h1").first().text().trim() ||
-      $('meta[property="og:title"]').attr("content") ||
-      "";
+      let content = "";
+      let title = "";
+      let description = "";

-    // Extract description
-    description =
-      $('meta[name="description"]').attr("content") ||
-      $('meta[property="og:description"]').attr("content") ||
-      "";
+      // Extract title
+      const titleElement = document.querySelector("title");
+      const h1Element = document.querySelector("h1");
+      const ogTitleMeta = document.querySelector('meta[property="og:title"]');

-    // Try multiple content extraction strategies
-    const contentSelectors = [
-      // Common article selectors
-      "article",
-      '[role="main"]',
-      ".article-content",
-      ".post-content",
-      ".entry-content",
-      ".content",
-      ".main-content",
-      ".article-body",
-      ".post-body",
-      ".story-body",
-      ".news-content",
+      title =
+        titleElement?.textContent?.trim() ||
+        h1Element?.textContent?.trim() ||
+        ogTitleMeta?.getAttribute("content") ||
+        "";

-      // Japanese news site specific selectors
-      ".article",
-      ".news-article",
-      ".post",
-      ".entry",
-      "#content",
-      "#main",
-      ".main",
+      // Extract description
+      const descriptionMeta = document.querySelector('meta[name="description"]');
+      const ogDescriptionMeta = document.querySelector(
+        'meta[property="og:description"]',
+      );

-      // Fallback to common containers
-      ".container",
-      "#container",
-      "main",
-      "body",
-    ];
+      description =
+        descriptionMeta?.getAttribute("content") ||
+        ogDescriptionMeta?.getAttribute("content") ||
+        "";

-    for (const selector of contentSelectors) {
-      const element = $(selector);
-      if (element.length > 0) {
-        // Get text content and clean it up
-        let extractedText = element.text().trim();
+      // Try multiple content extraction strategies
+      const contentSelectors = [
+        // Common article selectors
+        "article",
+        '[role="main"]',
+        ".article-content",
+        ".post-content",
+        ".entry-content",
+        ".content",
+        ".main-content",
+        ".article-body",
+        ".post-body",
+        ".story-body",
+        ".news-content",

-        // Remove extra whitespace and normalize
-        extractedText = extractedText
-          .replace(/\s+/g, " ")
-          .replace(/\n\s*\n/g, "\n")
-          .trim();
+        // Japanese news site specific selectors
+        ".article",
+        ".news-article",
+        ".post",
+        ".entry",
+        "#content",
+        "#main",
+        ".main",

-        // Only use if we found substantial content
-        if (extractedText.length > 200) {
-          content = extractedText;
-          break;
+        // Fallback to common containers
+        ".container",
+        "#container",
+        "main",
+        "body",
+      ];
+
+      for (const selector of contentSelectors) {
+        const element = document.querySelector(selector);
+        if (element) {
+          // Get text content and clean it up
+          let extractedText = element.textContent?.trim() || "";
+
+          // Remove extra whitespace and normalize
+          extractedText = extractedText
+            .replace(/\s+/g, " ")
+            .replace(/\n\s*\n/g, "\n")
+            .trim();
+
+          // Only use if we found substantial content
+          if (extractedText.length > 200) {
+            content = extractedText;
+            break;
+          }
        }
      }
-    }

-    // If still no content, try paragraph extraction
-    if (!content) {
-      const paragraphs = $("p")
-        .map((_, el) => $(el).text().trim())
-        .get();
-      content = paragraphs
-        .filter((p) => p.length > 50) // Filter out short paragraphs
-        .join("\n\n");
-    }
+      // If still no content, try paragraph extraction
+      if (!content) {
+        const paragraphs = Array.from(document.querySelectorAll("p"))
+          .map((p) => p.textContent?.trim() || "")
+          .filter((p) => p.length > 50); // Filter out short paragraphs
+        content = paragraphs.join("\n\n");
+      }

-    // Final fallback: use body text
-    if (!content || content.length < 100) {
-      content = $("body").text().replace(/\s+/g, " ").trim();
-    }
+      // Final fallback: use body text
+      if (!content || content.length < 100) {
+        const bodyText = document.body?.textContent || "";
+        content = bodyText.replace(/\s+/g, " ").trim();
+      }
+
+      return { title, content, description };
+    });

    // Validate extracted content
-    if (!content || content.length < 50) {
+    if (!extractedData.content || extractedData.content.length < 50) {
      return {
-        title,
+        title: extractedData.title,
        content: "",
-        description,
+        description: extractedData.description,
        success: false,
        error: "Insufficient content extracted",
      };
@@ -135,14 +203,15 @@ export async function extractArticleContent(

    // Limit content length to avoid token limits
    const maxLength = 5000;
+    let content = extractedData.content;
    if (content.length > maxLength) {
      content = content.substring(0, maxLength) + "...";
    }

    return {
-      title,
+      title: extractedData.title,
      content,
-      description,
+      description: extractedData.description,
      success: true,
    };
  } catch (error) {
@@ -153,6 +222,10 @@ export async function extractArticleContent(
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
+  } finally {
+    if (page) {
+      await page.close();
+    }
  }
 }