// VoiceRSSSummary/services/jmdict.ts

import { promises as fs } from "fs";
import path from "path";
import { type SetupType, readingAnywhere, setup } from "jmdict-simplified-node";

// Shared JMdict database handle for this module. It stays null until
// initialization (or the minimal fallback) assigns it.
let jmdictDb: SetupType | null = null;
// True while an initializeJMdict() call is in progress, so that concurrent
// callers wait for it instead of starting a second initialization.
let isInitializing = false;

// Database location: <current working directory>/data/jmdict-db.
const JMDICT_DB_PATH = path.join(process.cwd(), "data", "jmdict-db");
// Release URL of the gzip-compressed jmdict-simplified 3.1.0 English JSON.
// It is only printed as part of the manual-download instructions; nothing in
// this module fetches it.
const JMDICT_DATA_URL =
  "https://github.com/scriptin/jmdict-simplified/releases/download/3.1.0/jmdict-eng-3.1.0.json.gz";

/**
 * In-flight initialization shared by concurrent initializeJMdict() callers.
 * Non-null only while `isInitializing` is true.
 */
let jmdictInitialization: Promise<void> | null = null;

/**
 * Initialize the JMdict database.
 *
 * The database is obtained in this order:
 * 1. open the existing database at `JMDICT_DB_PATH`;
 * 2. otherwise build it from `jmdict-eng-3.1.0.json` if that file is present
 *    in the data directory;
 * 3. otherwise (file missing, or any step failed) install the minimal
 *    fallback via `createMinimalJMdictDatabase()`.
 *
 * Nothing is downloaded: when the JSON file is missing, manual download
 * instructions are logged instead.
 *
 * Calling this again after success is a no-op. Callers that arrive while an
 * initialization is running await that same run rather than starting another.
 *
 * @returns A promise that settles when the initialization run has finished.
 */
export async function initializeJMdict(): Promise<void> {
  if (jmdictDb) {
    return; // Already initialized
  }

  if (isInitializing && jmdictInitialization) {
    // Join the run that is already in progress instead of polling for it.
    await jmdictInitialization;
    return;
  }

  // The flag and the promise are set together, synchronously, so a caller
  // that sees the flag always finds a promise to await.
  isInitializing = true;
  jmdictInitialization = loadOrCreateJMdict();

  try {
    await jmdictInitialization;
  } finally {
    isInitializing = false;
    jmdictInitialization = null;
  }
}

/**
 * Perform one initialization run: open the database, build it from the local
 * JSON file, or fall back to the minimal stand-in.
 */
async function loadOrCreateJMdict(): Promise<void> {
  try {
    console.log("JMdict データベースを初期化中...");

    // Ensure data directory exists
    const dataDir = path.dirname(JMDICT_DB_PATH);
    await fs.mkdir(dataDir, { recursive: true });

    // Try to load existing database
    try {
      const existing = await setup(JMDICT_DB_PATH);
      jmdictDb = existing;
      console.log(
        `JMdict データベース読み込み完了 (辞書日付: ${existing.dictDate})`,
      );
      return;
    } catch (error) {
      console.log(
        "既存のJMdictデータベースが見つかりません。新規作成します...",
      );
      // Surface the underlying reason: failures other than a missing database
      // also end up here and would otherwise be indistinguishable.
      const reason = error instanceof Error ? error.message : String(error);
      console.log(`  詳細: ${reason}`);
    }

    // The database can only be built from a locally available JSON file.
    const jsonPath = path.join(dataDir, "jmdict-eng-3.1.0.json");

    try {
      await fs.access(jsonPath);
    } catch {
      console.log(
        "JMdict JSONファイルが見つかりません。ダウンロードが必要です。",
      );
      console.log(`手動でダウンロードしてください: ${JMDICT_DATA_URL}`);
      console.log(
        `ダウンロード後、解凍して以下のパスに配置してください: ${jsonPath}`,
      );

      // No source data available: install the minimal stand-in database.
      await createMinimalJMdictDatabase();
      return;
    }

    console.log("JMdict JSONファイルを使用してデータベースを作成中...");
    const created = await setup(JMDICT_DB_PATH, jsonPath, true);
    jmdictDb = created;
    console.log(
      `JMdict データベース作成完了 (辞書日付: ${created.dictDate})`,
    );
  } catch (error) {
    console.error("JMdictの初期化に失敗しました:", error);
    // Create a minimal fallback database
    await createMinimalJMdictDatabase();
  }
}

/**
 * Install a stand-in JMdict handle for when the real database is unavailable.
 *
 * The stand-in holds no dictionary entries: its `get` answers only the two
 * metadata keys, and its `createValueStream` returns an object whose `on`
 * returns an empty object.
 */
async function createMinimalJMdictDatabase(): Promise<void> {
  console.log("最小限のJMdictデータベースを作成中...");

  const fallbackDate = "2024-01-01";
  const fallbackVersion = "3.1.0-minimal";

  // Stub for `db.get`: knows the metadata keys, rejects for anything else.
  const lookupMetadata = async (key: string, _options?: any) => {
    switch (key) {
      case "raw/dictDate":
        return fallbackDate;
      case "raw/version":
        return fallbackVersion;
      default:
        throw new Error("Key not found");
    }
  };

  // Stub for `db.createValueStream`.
  const openStubStream = () => ({ on: () => ({}) }) as any;

  const stubDb = {
    get: lookupMetadata,
    createValueStream: openStubStream,
  } as any;

  jmdictDb = {
    db: stubDb,
    dictDate: fallbackDate,
    version: fallbackVersion,
  };

  console.log("最小限のJMdictデータベース作成完了");
}

/**
 * Look up katakana readings for an English word in JMdict.
 *
 * The trimmed word is tried in lower-case, upper-case and capitalized form
 * (identical variants are searched only once). Every kana reading of every
 * returned entry that passes `isKatakana` is collected.
 *
 * NOTE(review): the lookup goes through `readingAnywhere`, which the original
 * code describes as a search by reading (kana). Plain English spellings may
 * therefore rarely match; searching the English glosses is not implemented.
 *
 * @param englishWord - English word to search for
 * @returns Distinct katakana readings in the order they were found; an empty
 *   array for blank input, when no database is available, or on failure
 */
export async function searchEnglishToKatakana(
  englishWord: string,
): Promise<string[]> {
  if (!jmdictDb) {
    await initializeJMdict();
  }

  // Capture the handle once so the whole search runs against the same database.
  const dict = jmdictDb;
  if (!dict) {
    return [];
  }

  try {
    const word = englishWord.trim();
    if (word.length === 0) {
      // Nothing to look up; do not hand an empty search text to the database.
      return [];
    }

    if (dict.version.endsWith("-minimal")) {
      // The minimal fallback (version "3.1.0-minimal") is a stub without
      // entries; querying it can only fail, so skip the search entirely.
      return [];
    }

    // Case variants of the word; the Set drops variants that coincide
    // (e.g. "a" or "123").
    const searchTerms = new Set([
      word.toLowerCase(),
      word.toUpperCase(),
      word.charAt(0).toUpperCase() + word.slice(1).toLowerCase(),
    ]);

    const katakanaReadings = new Set<string>();

    for (const term of searchTerms) {
      try {
        // 10 is passed as the third argument, as before (presumably a result
        // limit — confirm against the library documentation).
        const readingResults = await readingAnywhere(dict.db, term, 10);
        for (const entry of readingResults) {
          for (const kana of entry.kana) {
            if (isKatakana(kana.text)) {
              katakanaReadings.add(kana.text);
            }
          }
        }
      } catch (searchError) {
        // A failing variant must not prevent the remaining variants from running.
        console.warn(`JMdict search failed for term "${term}":`, searchError);
      }
    }

    return Array.from(katakanaReadings);
  } catch (error) {
    console.error("JMdict英語→カタカナ変換エラー:", error);
    return [];
  }
}

/**
 * Report whether `text` has at least one character in the range
 * U+30A0–U+30FF. Mixed strings qualify as soon as one such character occurs.
 */
function isKatakana(text: string): boolean {
  const firstMatch = /[\u30A0-\u30FF]/.exec(text);
  return firstMatch !== null;
}

/**
 * Convert an English word to katakana, preferring a JMdict reading and
 * falling back to the spelling-based phonetic conversion.
 * @param englishWord - English word to convert
 * @returns The first reading found by `searchEnglishToKatakana`, or the
 *   phonetic conversion when the search finds nothing
 */
export async function convertEnglishToKatakanaWithJMdict(
  englishWord: string,
): Promise<string> {
  const dictionaryHits = await searchEnglishToKatakana(englishWord);

  // No dictionary reading: derive one from the spelling instead.
  if (dictionaryHits.length === 0) {
    return convertEnglishToKatakanaPhonetic(englishWord);
  }

  // Otherwise the first reading found wins.
  return dictionaryHits[0];
}

/**
 * Fixed katakana spellings for frequent loanwords, keyed by the lower-cased
 * English word. Held in a Map so that a lookup can never resolve to a member
 * inherited from Object.prototype (e.g. "constructor").
 */
const COMMON_WORD_KATAKANA: ReadonlyMap<string, string> = new Map<
  string,
  string
>(
  Object.entries({
    // Technology
    computer: "コンピューター",
    software: "ソフトウェア",
    hardware: "ハードウェア",
    internet: "インターネット",
    website: "ウェブサイト",
    email: "イーメール",
    digital: "デジタル",
    technology: "テクノロジー",
    programming: "プログラミング",
    algorithm: "アルゴリズム",
    database: "データベース",
    server: "サーバー",
    client: "クライアント",
    network: "ネットワーク",
    security: "セキュリティ",
    password: "パスワード",
    login: "ログイン",
    logout: "ログアウト",
    download: "ダウンロード",
    upload: "アップロード",

    // Common English words
    hello: "ハロー",
    world: "ワールド",
    news: "ニュース",
    business: "ビジネス",
    service: "サービス",
    system: "システム",
    management: "マネジメント",
    project: "プロジェクト",
    team: "チーム",
    meeting: "ミーティング",
    presentation: "プレゼンテーション",
    report: "レポート",
    analysis: "アナリシス",
    marketing: "マーケティング",
    strategy: "ストラテジー",
    solution: "ソリューション",
    development: "デベロップメント",
    innovation: "イノベーション",
    design: "デザイン",
    product: "プロダクト",
    quality: "クオリティ",
    performance: "パフォーマンス",
    efficiency: "エフィシエンシー",

    // Food and daily life
    coffee: "コーヒー",
    restaurant: "レストラン",
    hotel: "ホテル",
    shopping: "ショッピング",
    fashion: "ファッション",
    music: "ミュージック",
    movie: "ムービー",
    game: "ゲーム",
    sport: "スポーツ",
    travel: "トラベル",
    vacation: "バケーション",
    holiday: "ホリデー",
  }),
);

/** Two-letter sequences that are converted as a unit. */
const DIGRAPH_KATAKANA: ReadonlyMap<string, string> = new Map<string, string>([
  ["ch", "チ"],
  ["sh", "シ"],
  ["th", "ス"],
  ["ph", "フ"],
  ["ck", "ク"],
  ["ng", "ング"],
  ["qu", "クワ"],
]);

/** Katakana used for a single letter that is not part of a digraph. */
const LETTER_KATAKANA: ReadonlyMap<string, string> = new Map<string, string>([
  ["a", "ア"],
  ["e", "エ"],
  ["i", "イ"],
  ["o", "オ"],
  ["u", "ウ"],
  ["b", "ブ"],
  ["c", "ク"],
  ["d", "ド"],
  ["f", "フ"],
  ["g", "グ"],
  ["h", "ハ"],
  ["j", "ジ"],
  ["k", "ク"],
  ["l", "ル"],
  ["m", "ム"],
  ["n", "ン"],
  ["p", "プ"],
  ["r", "ル"],
  ["s", "ス"],
  ["t", "ト"],
  ["v", "ブ"],
  ["w", "ワ"],
  ["x", "クス"],
  ["y", "ワイ"],
  ["z", "ズ"],
]);

/**
 * Spelling-based English to katakana conversion.
 *
 * The lower-cased word is first looked up in the fixed loanword table. If it
 * is not listed, the word is scanned left to right: a known two-letter
 * sequence is converted as a unit, any other letter is converted on its own,
 * and characters without a mapping (digits, punctuation, ...) are copied
 * through unchanged.
 *
 * @param word - English word to convert; matching is case-insensitive
 * @returns The katakana rendering; an empty string for empty input
 */
function convertEnglishToKatakanaPhonetic(word: string): string {
  const lowerWord = word.toLowerCase();

  const knownWord = COMMON_WORD_KATAKANA.get(lowerWord);
  if (knownWord !== undefined) {
    return knownWord;
  }

  let result = "";
  let i = 0;

  while (i < lowerWord.length) {
    // At the last character the slice is a single letter and matches no digraph.
    const digraph = DIGRAPH_KATAKANA.get(lowerWord.slice(i, i + 2));
    if (digraph !== undefined) {
      result += digraph;
      i += 2;
      continue;
    }

    const char = lowerWord.charAt(i);
    result += LETTER_KATAKANA.get(char) ?? char;
    i += 1;
  }

  return result;
}

/**
 * Report whether a JMdict handle is currently installed in this module.
 * @returns `false` while the module-level handle is still `null`, `true` otherwise
 */
export function isJMdictInitialized(): boolean {
  if (jmdictDb === null) {
    return false;
  }
  return true;
}

/**
 * Describe the currently installed JMdict handle.
 * @returns Its dictionary date and version, or `null` when no handle is installed
 */
export function getJMdictInfo(): { dictDate: string; version: string } | null {
  const current = jmdictDb;

  if (current) {
    const { dictDate, version } = current;
    return { dictDate, version };
  }

  return null;
}