// Helpers for downloading a web page and extracting its readable article text with cheerio.
import * as cheerio from "cheerio";
/**
 * Outcome of an article extraction attempt.
 *
 * On success `success` is `true` and `content` holds the extracted text.
 * On failure `success` is `false`, `content` is the empty string and
 * `error` carries a human-readable reason.
 */
export interface ExtractedContent {
  /** Page title, when one could be determined. */
  title?: string;
  /** Extracted article text; empty when extraction failed. */
  content: string;
  /** Description taken from the page's meta tags, when present. */
  description?: string;
  /** Whether usable content was extracted. */
  success: boolean;
  /** Failure reason; only set when `success` is `false`. */
  error?: string;
}

/** Browser-like request headers sent with every article fetch. */
const ARTICLE_FETCH_HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  Accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
  "Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
  "Accept-Encoding": "gzip, deflate",
  Connection: "keep-alive",
  "Upgrade-Insecure-Requests": "1",
};

/** Abort the request if the server has not answered within this many milliseconds. */
const ARTICLE_FETCH_TIMEOUT_MS = 30000;

/** Elements that are stripped from the document before text is collected. */
const UNWANTED_ELEMENT_SELECTOR =
  "script, style, nav, header, footer, aside, .advertisement, .ads, .sidebar, .menu, .navigation, .social-share, .comments";

/** Candidate containers for the article body, tried in order. */
const ARTICLE_CONTENT_SELECTORS = [
  // Common article selectors
  "article",
  '[role="main"]',
  ".article-content",
  ".post-content",
  ".entry-content",
  ".content",
  ".main-content",
  ".article-body",
  ".post-body",
  ".story-body",
  ".news-content",

  // Japanese news site specific selectors
  ".article",
  ".news-article",
  ".post",
  ".entry",
  "#content",
  "#main",
  ".main",

  // Fallback to common containers
  ".container",
  "#container",
  "main",
  "body",
];

/** A container's text must be longer than this to be accepted as the article. */
const MIN_CONTAINER_TEXT_LENGTH = 200;
/** Paragraphs at or below this length are ignored by the paragraph fallback. */
const MIN_PARAGRAPH_LENGTH = 50;
/** Below this length the collected text is replaced by the whole body text. */
const MIN_FALLBACK_LENGTH = 100;
/** Below this length the extraction is reported as a failure. */
const MIN_ACCEPTED_LENGTH = 50;
/** Maximum number of UTF-16 code units kept before "..." is appended. */
const MAX_CONTENT_LENGTH = 5000;

/**
 * Downloads the page at `url` and extracts its title, meta description and
 * main article text.
 *
 * Text is taken from the first selector in `ARTICLE_CONTENT_SELECTORS` whose
 * matched elements yield more than 200 characters, then from long `<p>`
 * elements, then from the whole `<body>`. The result is whitespace-collapsed
 * and capped at 5000 characters (plus a trailing "...").
 *
 * This function never throws: invalid or non-http(s) URLs, network errors,
 * timeouts, non-2xx responses and parsing problems are all reported through
 * `success: false` and `error`.
 *
 * @param url Absolute `http:` or `https:` URL of the article page.
 * @returns The extraction result; `content` is empty when `success` is false.
 */
export async function extractArticleContent(
  url: string,
): Promise<ExtractedContent> {
  try {
    // The link comes from external data, so refuse anything that is not a
    // plain web URL before touching the network. `new URL` throws on
    // malformed input, which the catch below turns into a failure result.
    const { protocol } = new URL(url);
    if (protocol !== "http:" && protocol !== "https:") {
      throw new Error(`Unsupported URL protocol: ${protocol}`);
    }

    // Fetch the HTML content
    const response = await fetch(url, {
      headers: ARTICLE_FETCH_HEADERS,
      signal: AbortSignal.timeout(ARTICLE_FETCH_TIMEOUT_MS),
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Read title and description BEFORE pruning: the main <h1> frequently
    // lives inside <header>, which is removed below. `.first()` keeps inline
    // SVG <title> elements from being concatenated onto the page title.
    const title =
      $("title").first().text().trim() ||
      $("h1").first().text().trim() ||
      $('meta[property="og:title"]').attr("content")?.trim() ||
      "";

    const description =
      $('meta[name="description"]').attr("content")?.trim() ||
      $('meta[property="og:description"]').attr("content")?.trim() ||
      "";

    // Remove unwanted elements
    $(UNWANTED_ELEMENT_SELECTOR).remove();

    let content = "";

    // Try each candidate container until one yields substantial text.
    for (const selector of ARTICLE_CONTENT_SELECTORS) {
      const element = $(selector);
      if (element.length === 0) {
        continue;
      }

      // Collapse every run of whitespace (including newlines) to one space.
      const extractedText = element.text().replace(/\s+/g, " ").trim();
      if (extractedText.length > MIN_CONTAINER_TEXT_LENGTH) {
        content = extractedText;
        break;
      }
    }

    // If still no content, try paragraph extraction
    if (!content) {
      const paragraphs = $("p")
        .map((_, el) => $(el).text().trim())
        .get();
      content = paragraphs
        .filter((p) => p.length > MIN_PARAGRAPH_LENGTH)
        .join("\n\n");
    }

    // Final fallback: use body text
    if (content.length < MIN_FALLBACK_LENGTH) {
      content = $("body").text().replace(/\s+/g, " ").trim();
    }

    // Validate extracted content
    if (content.length < MIN_ACCEPTED_LENGTH) {
      return {
        title,
        content: "",
        description,
        success: false,
        error: "Insufficient content extracted",
      };
    }

    // Limit content length to avoid token limits
    if (content.length > MAX_CONTENT_LENGTH) {
      let cut = MAX_CONTENT_LENGTH;
      // Do not cut between the two halves of a surrogate pair; that would
      // leave a lone high surrogate (an invalid character) at the end.
      const lastUnit = content.charCodeAt(cut - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        cut -= 1;
      }
      content = content.substring(0, cut) + "...";
    }

    return {
      title,
      content,
      description,
      success: true,
    };
  } catch (error: unknown) {
    return {
      title: "",
      content: "",
      description: "",
      success: false,
      error: error instanceof Error ? error.message : "Unknown error occurred",
    };
  }
}

/**
 * Returns the best available content/description for an article, fetching
 * the linked page only when the text already at hand is short.
 *
 * The first non-empty value of `originalContent` / `originalDescription` is
 * treated as the existing text. If it is longer than 500 characters the
 * originals are returned untouched. Otherwise the page at `originalLink` is
 * fetched via `extractArticleContent`; on success its content is returned
 * (with the extracted description, falling back to the original one), and on
 * failure the originals are returned unchanged.
 *
 * @param originalTitle Title of the article. Not used by this function; kept
 *   so existing callers keep working.
 * @param originalLink URL of the article page to fetch when needed.
 * @param originalContent Content already known for the article, if any.
 * @param originalDescription Description already known for the article, if any.
 * @returns The content and description to use; either may be `undefined`
 *   when nothing was supplied and extraction failed.
 */
export async function enhanceArticleContent(
  originalTitle: string,
  originalLink: string,
  originalContent?: string,
  originalDescription?: string,
): Promise<{ content?: string; description?: string }> {
  const originals = {
    content: originalContent,
    description: originalDescription,
  };

  // `||` (not `??`) on purpose: an empty content string should fall through
  // to the description.
  const existingContent = originalContent || originalDescription || "";
  if (existingContent.length > 500) {
    // Already substantial; skip the network round-trip.
    return originals;
  }

  // Try to extract content from the URL
  const extracted = await extractArticleContent(originalLink);
  if (!extracted.success || !extracted.content) {
    // Extraction failed; keep whatever the caller gave us.
    return originals;
  }

  return {
    content: extracted.content,
    description: extracted.description || originalDescription,
  };
}