Files
VoiceRSSSummary/services/text-converter.ts

337 lines
9.2 KiB
TypeScript

import Kuroshiro from "kuroshiro";
import KuroshiroAnalyzerMecab from "kuroshiro-analyzer-mecab";
import { toKatakana } from "wanakana";
import {
convertEnglishToKatakanaWithJMdict,
getJMdictInfo,
initializeJMdict,
isJMdictInitialized,
} from "./jmdict.js";
// Module-wide Kuroshiro instance, kept so the analyzer is not recreated on every call.
// Starts out as null; initializeTextConverter() assigns it, and the conversion
// functions in this module check it before doing any work.
let kuroshiroInstance: Kuroshiro | null = null;
/**
 * Hand-maintained English → Katakana readings for common words.
 *
 * Keys are lowercase English words, values are their Katakana readings.
 *
 * The table lives on a null-prototype object: it is indexed with arbitrary
 * words, and on a plain object literal a word such as "constructor" or
 * "toString" would resolve to a member inherited from Object.prototype
 * instead of being reported as missing.
 */
const englishToKatakanaMap: Record<string, string> = Object.assign(
  // Object.create() is typed as `any`; the assertion only pins the shape we fill in below.
  Object.create(null) as Record<string, string>,
  {
    hello: "ハロー",
    world: "ワールド",
    this: "ディス",
    is: "イズ",
    a: "ア",
    test: "テスト",
    javascript: "ジャバスクリプト",
    typescript: "タイプスクリプト",
    and: "アンド",
    api: "エーピーアイ",
    endpoint: "エンドポイント",
    machine: "マシン",
    learning: "ラーニング",
    model: "モデル",
    analysis: "アナリシス",
    computer: "コンピューター",
    data: "データ",
    software: "ソフトウェア",
    program: "プログラム",
    system: "システム",
    network: "ネットワーク",
    server: "サーバー",
    client: "クライアント",
    database: "データベース",
    file: "ファイル",
    user: "ユーザー",
    password: "パスワード",
    login: "ログイン",
    logout: "ログアウト",
    website: "ウェブサイト",
    browser: "ブラウザー",
    application: "アプリケーション",
    service: "サービス",
  },
);
/**
 * Convert a single English word to Katakana.
 *
 * Strategies are tried in order and the first usable result wins:
 * 1. JMdict lookup, only when JMdict is already initialized;
 * 2. the predefined `englishToKatakanaMap` table, keyed by the lowercased word;
 * 3. wanakana's `toKatakana` on the lowercased word, accepted only when the
 *    output no longer contains any Latin letter;
 * 4. the letter-by-letter approximation of `approximateEnglishToKatakana`.
 *
 * Failures in steps 1 and 3 are logged and treated as "no result"; they never
 * abort the conversion.
 *
 * @param word - The English word to convert.
 * @returns The Katakana rendering chosen by the first strategy that succeeds.
 */
async function convertEnglishWordToKatakana(word: string): Promise<string> {
  const lowerWord = word.toLowerCase();

  // 1. JMdict, if available.
  try {
    if (isJMdictInitialized()) {
      const jmdictResult = await convertEnglishToKatakanaWithJMdict(word);
      // A falsy result, or the word handed back unchanged, counts as "no match".
      if (jmdictResult && jmdictResult !== word) {
        return jmdictResult;
      }
    }
  } catch (error) {
    console.warn(`JMdict conversion failed for "${word}":`, error);
  }

  // 2. Predefined table. An own-property check is required: a bare index
  // lookup would also find inherited members (e.g. "constructor") and hand
  // back a function instead of a string.
  if (Object.prototype.hasOwnProperty.call(englishToKatakanaMap, lowerWord)) {
    const mapped = englishToKatakanaMap[lowerWord];
    if (mapped) {
      return mapped;
    }
  }

  // 3. wanakana romaji → katakana. Only a complete conversion is accepted;
  // output that still carries a–z letters is a partial conversion and would
  // leak mixed kana/Latin text into the result.
  try {
    const katakana = toKatakana(lowerWord);
    if (katakana && katakana !== lowerWord && !/[a-z]/i.test(katakana)) {
      return katakana;
    }
  } catch (error) {
    // Best effort only: log for parity with the JMdict branch and fall through.
    console.warn(`wanakana conversion failed for "${word}":`, error);
  }

  // 4. Last resort: simple phonetic approximation.
  return approximateEnglishToKatakana(word);
}
/**
 * Letter-by-letter Katakana approximation of an English word.
 *
 * The input is lowercased, then each of the letters a–z is swapped for a fixed
 * Katakana stand-in. Any other character is carried over unchanged.
 *
 * @param word - Text to approximate.
 * @returns The approximated string; empty input yields an empty string.
 */
function approximateEnglishToKatakana(word: string): string {
  const kanaByLetter = new Map<string, string>([
    ["a", "ア"],
    ["b", "ブ"],
    ["c", "ク"],
    ["d", "ド"],
    ["e", "エ"],
    ["f", "フ"],
    ["g", "グ"],
    ["h", "ハ"],
    ["i", "イ"],
    ["j", "ジ"],
    ["k", "ク"],
    ["l", "ル"],
    ["m", "ム"],
    ["n", "ン"],
    ["o", "オ"],
    ["p", "プ"],
    ["q", "ク"],
    ["r", "ル"],
    ["s", "ス"],
    ["t", "ト"],
    ["u", "ウ"],
    ["v", "ブ"],
    ["w", "ワ"],
    ["x", "クス"],
    ["y", "ワイ"],
    ["z", "ズ"],
  ]);

  let approximated = "";
  for (const unit of word.toLowerCase().split("")) {
    const kana = kanaByLetter.get(unit);
    // Characters without a table entry pass through as they are.
    approximated += kana ? kana : unit;
  }
  return approximated;
}
/**
 * Initialize Kuroshiro (with the MeCab analyzer) and JMdict.
 *
 * Meant to be called once during application startup. Calling it again is
 * safe: a component that is already initialized is skipped, and when both are
 * ready the function returns immediately.
 *
 * @throws Re-throws the error raised by Kuroshiro or JMdict initialization
 *   after logging it.
 */
export async function initializeTextConverter(): Promise<void> {
  if (kuroshiroInstance && isJMdictInitialized()) {
    return; // Already initialized
  }
  try {
    console.log("テキストコンバーターを初期化中...");

    // Initialize Kuroshiro if not already done
    if (!kuroshiroInstance) {
      console.log("Kuroshiroを初期化中...");
      // Publish the instance only after init() has succeeded. Assigning it
      // beforehand would leave a non-null but unusable instance behind when
      // init() rejects (or while it is still pending), and every later check
      // of `kuroshiroInstance` would wrongly treat the converter as ready.
      const instance = new Kuroshiro();
      await instance.init(new KuroshiroAnalyzerMecab());
      kuroshiroInstance = instance;
      console.log("Kuroshiro初期化完了");
    }

    // Initialize JMdict if not already done
    if (!isJMdictInitialized()) {
      console.log("JMdictを初期化中...");
      await initializeJMdict();
      console.log("JMdict初期化完了");
      const jmdictInfo = getJMdictInfo();
      if (jmdictInfo) {
        console.log(
          `JMdict情報: バージョン ${jmdictInfo.version}, 辞書日付 ${jmdictInfo.dictDate}`,
        );
      }
    }

    console.log("テキストコンバーター初期化完了");
  } catch (error) {
    console.error("テキストコンバーターの初期化に失敗しました:", error);
    throw error;
  }
}
/**
 * Run the whole text through Kuroshiro with Katakana as the target script.
 *
 * The converter is initialized on demand when no Kuroshiro instance exists yet.
 * If the conversion itself fails, the error is logged and the input text is
 * returned unchanged.
 *
 * NOTE(review): the conversion is delegated entirely to Kuroshiro's
 * `convert(text, { to: "katakana", mode: "normal" })`; whether Latin-alphabet
 * words are transliterated by that call is not visible here — confirm before
 * relying on it for English input.
 *
 * @param text - Input text (may contain Japanese, English, and other characters)
 * @returns The converted text, or `text` itself when conversion fails
 * @throws Error when no Kuroshiro instance is available even after
 *   initialization; errors from `initializeTextConverter` also propagate
 */
export async function convertEnglishToKatakana(text: string): Promise<string> {
  if (kuroshiroInstance === null) {
    await initializeTextConverter();
  }

  const converter = kuroshiroInstance;
  if (converter === null) {
    throw new Error("Failed to initialize kuroshiro");
  }

  try {
    return await converter.convert(text, {
      to: "katakana",
      mode: "normal",
    });
  } catch (error) {
    console.error("テキスト変換エラー:", error);
    // Conversion is best effort: hand the caller back what it passed in.
    return text;
  }
}
/**
 * Convert only English words to Katakana while leaving all other text as is.
 *
 * English words are runs of ASCII letters delimited by regex word boundaries
 * (`/\b[a-zA-Z]+\b/g`). Each distinct word is converted once with
 * `convertEnglishWordToKatakana`, then every occurrence is substituted in a
 * single pass over the original text. A word whose conversion fails is logged
 * and kept unchanged.
 *
 * The converter is initialized on demand when no Kuroshiro instance exists yet.
 * If the selective conversion fails as a whole, the error is logged and the
 * text is passed to `convertEnglishToKatakana` instead.
 *
 * @param text - Input text
 * @returns Text with only English words converted to katakana
 * @throws Error when no Kuroshiro instance is available even after
 *   initialization; errors from `initializeTextConverter` also propagate
 */
export async function convertEnglishWordsOnly(text: string): Promise<string> {
  if (!kuroshiroInstance) {
    await initializeTextConverter();
  }
  if (!kuroshiroInstance) {
    throw new Error("Failed to initialize kuroshiro");
  }
  try {
    const englishWordPattern = /\b[a-zA-Z]+\b/g;
    const matches = text.match(englishWordPattern);
    if (!matches) {
      return text;
    }

    // Convert each distinct word once; repeated words share the result.
    const katakanaByWord = new Map<string, string>();
    for (const englishWord of new Set(matches)) {
      try {
        const converted = await convertEnglishWordToKatakana(englishWord);
        // Runtime guard: the value is later returned from a replacer callback,
        // where anything that is not a string would be stringified verbatim
        // into the output. Skipping it keeps the original word instead.
        if (typeof converted === "string") {
          katakanaByWord.set(englishWord, converted);
        }
      } catch (convertError) {
        console.warn(
          `Failed to convert word "${englishWord}":`,
          convertError,
        );
        // Keep original word if conversion fails
      }
    }

    // Substitute in one pass over the original text. Compared with running a
    // separate global replace per word on an evolving result, this can never
    // re-match inside text produced by an earlier replacement, and the
    // callback form keeps "$" sequences in a conversion from being read as
    // replacement patterns.
    return text.replace(
      englishWordPattern,
      (match) => katakanaByWord.get(match) ?? match,
    );
  } catch (error) {
    console.error("選択的テキスト変換エラー:", error);
    // Fallback to full conversion
    return convertEnglishToKatakana(text);
  }
}
/**
 * Report whether both parts of the text converter are available.
 *
 * @returns `true` only when a Kuroshiro instance has been assigned and
 *   `isJMdictInitialized()` reports `true`.
 */
export function isTextConverterInitialized(): boolean {
  if (kuroshiroInstance === null) {
    return false;
  }
  return isJMdictInitialized();
}
/**
 * Report whether a Kuroshiro instance has been assigned, regardless of the
 * JMdict state (kept for backward compatibility).
 *
 * @returns `true` when the module-level Kuroshiro instance is not null.
 */
export function isKuroshiroInitialized(): boolean {
  const hasInstance = kuroshiroInstance !== null;
  return hasInstance;
}
/**
 * Snapshot of the text converter's current state.
 *
 * @returns An object with
 *   - `kuroshiro`: whether a Kuroshiro instance has been assigned,
 *   - `jmdict`: the value reported by `isJMdictInitialized()`,
 *   - `jmdictInfo`: whatever `getJMdictInfo()` returns (dictionary date and
 *     version, or `null`).
 */
export function getTextConverterInfo(): {
  kuroshiro: boolean;
  jmdict: boolean;
  jmdictInfo: { dictDate: string; version: string } | null;
} {
  const kuroshiro = kuroshiroInstance !== null;
  const jmdict = isJMdictInitialized();
  const jmdictInfo = getJMdictInfo();
  return { kuroshiro, jmdict, jmdictInfo };
}
/**
 * Convert the English words in a text to Katakana via the JMdict-backed
 * converter.
 *
 * JMdict is initialized on demand. English words are runs of ASCII letters
 * delimited by regex word boundaries (`/\b[a-zA-Z]+\b/g`). Each distinct word
 * is converted once with `convertEnglishToKatakanaWithJMdict`, then every
 * occurrence is substituted in a single pass over the original text. A word
 * whose conversion fails is logged and kept unchanged.
 *
 * If the conversion fails as a whole, the error is logged and the text is
 * handed to `convertEnglishWordsOnly` instead.
 *
 * @param text - Input text containing English words
 * @returns Text with English words converted to Katakana using JMdict
 * @throws Propagates errors from `initializeJMdict()`, which runs before the
 *   fallback handling is in effect
 */
export async function convertEnglishToKatakanaWithJMdictFallback(
  text: string,
): Promise<string> {
  if (!isJMdictInitialized()) {
    await initializeJMdict();
  }
  try {
    const englishWordPattern = /\b[a-zA-Z]+\b/g;
    const matches = text.match(englishWordPattern);
    if (!matches) {
      return text;
    }

    // Process each unique word to avoid duplicate conversions
    const katakanaByWord = new Map<string, string>();
    for (const englishWord of new Set(matches)) {
      try {
        katakanaByWord.set(
          englishWord,
          await convertEnglishToKatakanaWithJMdict(englishWord),
        );
      } catch (convertError) {
        console.warn(
          `Failed to convert word "${englishWord}":`,
          convertError,
        );
        // Keep original word if conversion fails
      }
    }

    // Substitute in one pass over the original text. Compared with running a
    // separate global replace per word on an evolving result, this can never
    // re-match inside text produced by an earlier replacement, and the
    // callback form keeps "$" sequences in a conversion from being read as
    // replacement patterns.
    return text.replace(
      englishWordPattern,
      (match) => katakanaByWord.get(match) ?? match,
    );
  } catch (error) {
    console.error("JMdict-based English to Katakana conversion error:", error);
    // Fallback to the original method if JMdict conversion fails
    return convertEnglishWordsOnly(text);
  }
}