// VoiceRSSSummary/services/text-converter.ts

import Kuroshiro from "kuroshiro";
import KuroshiroAnalyzerMecab from "kuroshiro-analyzer-mecab";
import { toKatakana } from "wanakana";
import {
  convertEnglishToKatakanaWithJMdict,
  getJMdictInfo,
  initializeJMdict,
  isJMdictInitialized,
} from "./jmdict.js";

// Module-level Kuroshiro instance, kept so the analyzer is not recreated for
// every conversion. It starts as null and is assigned by
// initializeTextConverter(); the conversion functions in this file trigger
// that initialization themselves when they find it still null.
let kuroshiroInstance: Kuroshiro | null = null;

// Hand-maintained English → Katakana table for common words.
// Keys are lowercase: convertEnglishWordToKatakana() lowercases the word before
// looking it up here, and consults this table only after the JMdict lookup did
// not produce a result.
const englishToKatakanaMap: Record<string, string> = {
  hello: "ハロー",
  world: "ワールド",
  this: "ディス",
  is: "イズ",
  a: "ア",
  test: "テスト",
  javascript: "ジャバスクリプト",
  typescript: "タイプスクリプト",
  and: "アンド",
  api: "エーピーアイ",
  endpoint: "エンドポイント",
  machine: "マシン",
  learning: "ラーニング",
  model: "モデル",
  analysis: "アナリシス",
  computer: "コンピューター",
  data: "データ",
  software: "ソフトウェア",
  program: "プログラム",
  system: "システム",
  network: "ネットワーク",
  server: "サーバー",
  client: "クライアント",
  database: "データベース",
  file: "ファイル",
  user: "ユーザー",
  password: "パスワード",
  login: "ログイン",
  logout: "ログアウト",
  website: "ウェブサイト",
  browser: "ブラウザー",
  application: "アプリケーション",
  service: "サービス",
};

/**
 * Convert a single English word to Katakana.
 *
 * Sources are tried in order and the first usable result wins:
 *   1. JMdict, when it is initialized and returns something other than the input;
 *   2. the predefined `englishToKatakanaMap` table (keyed by lowercase word);
 *   3. wanakana's `toKatakana`, accepted only when the result contains no
 *      Latin letters any more;
 *   4. the letter-by-letter approximation of `approximateEnglishToKatakana`.
 *
 * Failures in steps 1 and 3 are logged and treated as "no result", so this
 * function falls through to the next source instead of rejecting.
 *
 * @param word - The English word to convert.
 * @returns The Katakana rendering chosen for `word`.
 */
async function convertEnglishWordToKatakana(word: string): Promise<string> {
  const lowerWord = word.toLowerCase();

  // First try JMdict if available
  try {
    if (isJMdictInitialized()) {
      const jmdictResult = await convertEnglishToKatakanaWithJMdict(word);
      if (jmdictResult && jmdictResult !== word) {
        return jmdictResult;
      }
    }
  } catch (error) {
    console.warn(`JMdict conversion failed for "${word}":`, error);
  }

  // Check predefined mapping second. The table is a plain object, so an
  // own-property check is required: a bare index lookup would also find
  // inherited members such as "constructor" or "valueOf" and return a
  // function instead of a string.
  if (Object.prototype.hasOwnProperty.call(englishToKatakanaMap, lowerWord)) {
    const mapped = englishToKatakanaMap[lowerWord];
    if (mapped) {
      return mapped;
    }
  }

  // Try reading the word as romaji with wanakana. A result that still holds
  // Latin letters is only a partial conversion, so it is rejected in favour
  // of the all-Katakana approximation below.
  try {
    const katakana = toKatakana(lowerWord);
    if (katakana && katakana !== lowerWord && !/[a-z]/i.test(katakana)) {
      return katakana;
    }
  } catch (error) {
    // Best effort: report the failure and fall through to the approximation.
    console.warn(`wanakana conversion failed for "${word}":`, error);
  }

  // Fallback: simple phonetic approximation
  return approximateEnglishToKatakana(word);
}

/**
 * Last-resort transliteration of an English word into Katakana.
 *
 * The word is lowercased and each of the letters a–z is swapped for one fixed
 * Katakana string; every other character is copied through untouched. No
 * attempt is made to model pronunciation beyond this per-letter substitution.
 *
 * @param word - The word to transliterate.
 * @returns The lowercased word with each Latin letter replaced by its Katakana stand-in.
 */
function approximateEnglishToKatakana(word: string): string {
  const kanaByLetter = new Map<string, string>([
    ["a", "ア"],
    ["b", "ブ"],
    ["c", "ク"],
    ["d", "ド"],
    ["e", "エ"],
    ["f", "フ"],
    ["g", "グ"],
    ["h", "ハ"],
    ["i", "イ"],
    ["j", "ジ"],
    ["k", "ク"],
    ["l", "ル"],
    ["m", "ム"],
    ["n", "ン"],
    ["o", "オ"],
    ["p", "プ"],
    ["q", "ク"],
    ["r", "ル"],
    ["s", "ス"],
    ["t", "ト"],
    ["u", "ウ"],
    ["v", "ブ"],
    ["w", "ワ"],
    ["x", "クス"],
    ["y", "ワイ"],
    ["z", "ズ"],
  ]);

  let transliterated = "";
  for (const unit of word.toLowerCase().split("")) {
    // Characters without an entry (digits, punctuation, kana, ...) are kept.
    transliterated += kanaByLetter.get(unit) ?? unit;
  }
  return transliterated;
}

/**
 * Initialize Kuroshiro (with the MeCab analyzer) and JMdict.
 *
 * Intended to be called once during application startup. Calling it again is
 * harmless: each component is only initialized when it is not ready yet, and
 * the function returns immediately when both already are.
 *
 * The shared Kuroshiro instance is published only after its `init()` call has
 * succeeded, so a failed initialization never leaves an unusable instance
 * behind that later calls would mistake for a ready one.
 *
 * @throws Rethrows any error raised while initializing Kuroshiro or JMdict,
 *   after logging it.
 */
export async function initializeTextConverter(): Promise<void> {
  if (kuroshiroInstance && isJMdictInitialized()) {
    return; // Already initialized
  }

  try {
    console.log("テキストコンバーターを初期化中...");

    // Initialize Kuroshiro if not already done
    if (!kuroshiroInstance) {
      console.log("Kuroshiroを初期化中...");
      // Build and initialize a local instance first; assign the module-level
      // variable only once init() has resolved.
      const instance = new Kuroshiro();
      await instance.init(new KuroshiroAnalyzerMecab());
      kuroshiroInstance = instance;
      console.log("Kuroshiro初期化完了");
    }

    // Initialize JMdict if not already done
    if (!isJMdictInitialized()) {
      console.log("JMdictを初期化中...");
      await initializeJMdict();
      console.log("JMdict初期化完了");

      const jmdictInfo = getJMdictInfo();
      if (jmdictInfo) {
        console.log(
          `JMdict情報: バージョン ${jmdictInfo.version}, 辞書日付 ${jmdictInfo.dictDate}`,
        );
      }
    }

    console.log("テキストコンバーター初期化完了");
  } catch (error) {
    console.error("テキストコンバーターの初期化に失敗しました:", error);
    throw error;
  }
}

/**
 * Run the whole input through Kuroshiro with target `katakana` in `normal` mode.
 *
 * The text converter is initialized on demand when no Kuroshiro instance
 * exists yet. A conversion error is logged and the input is handed back
 * unchanged rather than propagated.
 *
 * NOTE(review): how Kuroshiro treats Latin-script words in this mode is not
 * visible from this file — confirm against the Kuroshiro documentation.
 *
 * @param text - Input text (may contain Japanese, English, and other characters)
 * @returns The text as converted by Kuroshiro, or `text` itself if conversion failed
 * @throws Error when no Kuroshiro instance is available after initialization
 */
export async function convertEnglishToKatakana(text: string): Promise<string> {
  if (kuroshiroInstance === null) {
    await initializeTextConverter();
  }

  const converter = kuroshiroInstance;
  if (converter === null) {
    throw new Error("Failed to initialize kuroshiro");
  }

  let output: string;
  try {
    output = await converter.convert(text, {
      to: "katakana",
      mode: "normal",
    });
  } catch (error) {
    console.error("テキスト変換エラー:", error);
    // Conversion failed: fall back to the untouched input.
    output = text;
  }
  return output;
}

/**
 * Convert only the English words in `text` to Katakana, leaving everything
 * else (Japanese text, digits, punctuation) as it is.
 *
 * Each distinct ASCII-letter word is converted once with
 * `convertEnglishWordToKatakana`, then all words are substituted in a single
 * pass over the original text. Substituting in one pass means text produced by
 * one replacement is never matched and converted again by a later word, and a
 * `$` inside a converted value is inserted literally.
 *
 * A word whose conversion throws is logged and left unchanged. If the overall
 * process throws, the error is logged and the result of
 * `convertEnglishToKatakana(text)` is returned instead.
 *
 * @param text - Input text
 * @returns Text with only English words converted to katakana
 * @throws Error when no Kuroshiro instance is available after initialization
 */
export async function convertEnglishWordsOnly(text: string): Promise<string> {
  if (!kuroshiroInstance) {
    await initializeTextConverter();
  }

  if (!kuroshiroInstance) {
    throw new Error("Failed to initialize kuroshiro");
  }

  try {
    // Runs of ASCII letters delimited by word boundaries
    const englishWordPattern = /\b[a-zA-Z]+\b/g;

    const matches = text.match(englishWordPattern);
    if (!matches) {
      return text;
    }

    // Convert every distinct word exactly once.
    const conversions = new Map<string, string>();
    for (const englishWord of new Set(matches)) {
      try {
        conversions.set(
          englishWord,
          await convertEnglishWordToKatakana(englishWord),
        );
      } catch (convertError) {
        console.warn(
          `Failed to convert word "${englishWord}":`,
          convertError,
        );
        // No entry is stored, so the original word is kept below.
      }
    }

    // Single pass over the original text; the replacer function keeps
    // unconverted words and avoids `$`-pattern expansion.
    return text.replace(
      englishWordPattern,
      (word) => conversions.get(word) ?? word,
    );
  } catch (error) {
    console.error("選択的テキスト変換エラー:", error);
    // Fallback to full conversion
    return convertEnglishToKatakana(text);
  }
}

/**
 * Report whether the text converter is fully ready, i.e. a Kuroshiro instance
 * exists and JMdict reports itself as initialized.
 *
 * @returns `true` only when both components are available
 */
export function isTextConverterInitialized(): boolean {
  if (kuroshiroInstance === null) {
    return false;
  }
  return isJMdictInitialized();
}

/**
 * Report whether a Kuroshiro instance exists, regardless of JMdict's state
 * (kept for backward compatibility).
 *
 * @returns `true` when the shared Kuroshiro instance is set
 */
export function isKuroshiroInitialized(): boolean {
  const hasInstance = kuroshiroInstance !== null;
  return hasInstance;
}

/**
 * Collect a status snapshot of the text converter.
 *
 * @returns An object with:
 *   - `kuroshiro`: whether the shared Kuroshiro instance is set,
 *   - `jmdict`: the value reported by `isJMdictInitialized()`,
 *   - `jmdictInfo`: the value returned by `getJMdictInfo()`.
 */
export function getTextConverterInfo(): {
  kuroshiro: boolean;
  jmdict: boolean;
  jmdictInfo: { dictDate: string; version: string } | null;
} {
  const kuroshiro = kuroshiroInstance !== null;
  const jmdict = isJMdictInitialized();
  const jmdictInfo = getJMdictInfo();

  return { kuroshiro, jmdict, jmdictInfo };
}

/**
 * Convert the English words in `text` to Katakana via JMdict.
 *
 * JMdict is initialized on demand. Each distinct ASCII-letter word is passed
 * once to `convertEnglishToKatakanaWithJMdict`, then all words are substituted
 * in a single pass over the original text. Substituting in one pass means text
 * produced by one replacement is never matched and converted again by a later
 * word, and a `$` inside a converted value is inserted literally.
 *
 * A word whose conversion throws is logged and left unchanged. If the overall
 * process throws, the error is logged and `convertEnglishWordsOnly(text)` is
 * used instead. A failure of the on-demand JMdict initialization is not
 * caught here and rejects the returned promise.
 *
 * @param text - Input text containing English words
 * @returns Text with English words converted to Katakana using JMdict
 */
export async function convertEnglishToKatakanaWithJMdictFallback(
  text: string,
): Promise<string> {
  if (!isJMdictInitialized()) {
    await initializeJMdict();
  }

  try {
    // Runs of ASCII letters delimited by word boundaries
    const englishWordPattern = /\b[a-zA-Z]+\b/g;

    const matches = text.match(englishWordPattern);
    if (!matches) {
      return text;
    }

    // Convert every distinct word exactly once.
    const conversions = new Map<string, string>();
    for (const englishWord of new Set(matches)) {
      try {
        conversions.set(
          englishWord,
          await convertEnglishToKatakanaWithJMdict(englishWord),
        );
      } catch (convertError) {
        console.warn(
          `Failed to convert word "${englishWord}":`,
          convertError,
        );
        // No entry is stored, so the original word is kept below.
      }
    }

    // Single pass over the original text; the replacer function keeps
    // unconverted words and avoids `$`-pattern expansion.
    return text.replace(
      englishWordPattern,
      (word) => conversions.get(word) ?? word,
    );
  } catch (error) {
    console.error("JMdict-based English to Katakana conversion error:", error);
    // Fallback to the original method if JMdict conversion fails
    return convertEnglishWordsOnly(text);
  }
}