// 328 lines · 8.9 KiB · TypeScript
import Kuroshiro from "kuroshiro";
|
|
import KuroshiroAnalyzerMecab from "kuroshiro-analyzer-mecab";
|
|
import { toKatakana } from "wanakana";
|
|
import {
|
|
convertEnglishToKatakanaWithJMdict,
|
|
getJMdictInfo,
|
|
initializeJMdict,
|
|
isJMdictInitialized,
|
|
} from "./jmdict.js";
|
|
|
|
// Module-wide Kuroshiro instance, kept so the analyzer is not recreated on
// every conversion. It is null until initializeTextConverter() has set it.
let kuroshiroInstance: Kuroshiro | null = null;
|
|
|
|
/**
 * Basic English-to-katakana mapping for common words.
 *
 * Keys are lowercase English words; values are their katakana renderings.
 * Callers are expected to lowercase a word before looking it up.
 */
const englishToKatakanaMap: Record<string, string> = {
  hello: "ハロー",
  world: "ワールド",
  this: "ディス",
  is: "イズ",
  a: "ア",
  test: "テスト",
  javascript: "ジャバスクリプト",
  typescript: "タイプスクリプト",
  and: "アンド",
  api: "エーピーアイ",
  endpoint: "エンドポイント",
  machine: "マシン",
  learning: "ラーニング",
  model: "モデル",
  analysis: "アナリシス",
  computer: "コンピューター",
  data: "データ",
  software: "ソフトウェア",
  program: "プログラム",
  system: "システム",
  network: "ネットワーク",
  server: "サーバー",
  client: "クライアント",
  database: "データベース",
  file: "ファイル",
  user: "ユーザー",
  password: "パスワード",
  login: "ログイン",
  logout: "ログアウト",
  website: "ウェブサイト",
  browser: "ブラウザー",
  application: "アプリケーション",
  service: "サービス",
};
|
|
|
|
/**
 * Convert a single English word to katakana.
 *
 * Lookup order:
 *  1. JMdict, but only when it has already been initialized. The word is
 *     passed with its original casing.
 *  2. The predefined `englishToKatakanaMap`, looked up by the lowercased word.
 *
 * If neither source yields a conversion the word is returned unchanged.
 *
 * @param word - A single English word.
 * @returns The katakana rendering, or `word` itself when no conversion exists.
 */
async function convertEnglishWordToKatakana(word: string): Promise<string> {
  // First try JMdict if available. A failure here is logged and then ignored
  // so that the predefined mapping can still be consulted.
  try {
    if (isJMdictInitialized()) {
      const jmdictResult = await convertEnglishToKatakanaWithJMdict(word);
      // An empty result, or the input echoed back, is treated as "no conversion".
      if (jmdictResult && jmdictResult !== word) {
        return jmdictResult;
      }
    }
  } catch (error) {
    console.warn(`JMdict conversion failed for "${word}":`, error);
  }

  // Check the predefined mapping second. Only own properties count: a plain
  // index lookup would also find inherited Object.prototype members for
  // words such as "constructor" or "toString" and return a non-string value.
  const lowerWord = word.toLowerCase();
  if (Object.prototype.hasOwnProperty.call(englishToKatakanaMap, lowerWord)) {
    return englishToKatakanaMap[lowerWord];
  }

  // NOTE(review): no phonetic approximation is applied on this path; the
  // approximateEnglishToKatakana helper in this file is not called from here.
  console.warn(`No conversion found for "${word}", keeping the original word.`);
  return word;
}
|
|
|
|
/**
 * Simple letter-by-letter approximation of an English word in katakana.
 *
 * The input is lowercased, then each of the letters a–z is replaced by a fixed
 * katakana string. Any character without an entry in the table (digits,
 * punctuation, non-Latin text, ...) is kept as it is.
 *
 * @param word - Text to approximate.
 * @returns The input with every a–z letter replaced by its katakana stand-in.
 */
function approximateEnglishToKatakana(word: string): string {
  const phoneticMap: Record<string, string> = {
    a: "ア",
    b: "ブ",
    c: "ク",
    d: "ド",
    e: "エ",
    f: "フ",
    g: "グ",
    h: "ハ",
    i: "イ",
    j: "ジ",
    k: "ク",
    l: "ル",
    m: "ム",
    n: "ン",
    o: "オ",
    p: "プ",
    q: "ク",
    r: "ル",
    s: "ス",
    t: "ト",
    u: "ウ",
    v: "ブ",
    w: "ワ",
    x: "クス",
    y: "ワイ",
    z: "ズ",
  };

  return word
    .toLowerCase()
    .split("")
    // Characters with no table entry pass through untouched.
    .map((char) => phoneticMap[char] ?? char)
    .join("");
}
|
|
|
|
/**
 * Initialize Kuroshiro (with the MeCab analyzer) and JMdict.
 *
 * This should be called once during application startup. Calling it again is
 * a no-op once both parts are ready; if only one part is ready, just the
 * missing part is initialized.
 *
 * @throws Re-throws any error raised while initializing Kuroshiro or JMdict,
 *   after logging it.
 */
export async function initializeTextConverter(): Promise<void> {
  if (kuroshiroInstance && isJMdictInitialized()) {
    return; // Already initialized
  }

  try {
    console.log("テキストコンバーターを初期化中...");

    // Initialize Kuroshiro if not already done
    if (!kuroshiroInstance) {
      console.log("Kuroshiroを初期化中...");
      // Build and initialize a local instance first. The shared variable is
      // only set once init() has resolved, so a failed or still-running
      // initialization never leaves a non-null but unusable instance behind
      // for other callers to pick up.
      const instance = new Kuroshiro();
      await instance.init(new KuroshiroAnalyzerMecab());
      kuroshiroInstance = instance;
      console.log("Kuroshiro初期化完了");
    }

    // Initialize JMdict if not already done
    if (!isJMdictInitialized()) {
      console.log("JMdictを初期化中...");
      await initializeJMdict();
      console.log("JMdict初期化完了");

      const jmdictInfo = getJMdictInfo();
      if (jmdictInfo) {
        console.log(
          `JMdict情報: バージョン ${jmdictInfo.version}, 辞書日付 ${jmdictInfo.dictDate}`,
        );
      }
    }

    console.log("テキストコンバーター初期化完了");
  } catch (error) {
    console.error("テキストコンバーターの初期化に失敗しました:", error);
    throw error;
  }
}
|
|
|
|
/**
 * Run the whole text through Kuroshiro with katakana as the target.
 *
 * The converter is initialized on demand if that has not happened yet. If the
 * conversion itself throws, the error is logged and the input is returned
 * unchanged.
 *
 * @param text - Input text (may contain Japanese, English, and other characters)
 * @returns The text as converted by Kuroshiro, or `text` itself if conversion fails.
 * @throws If initialization fails, or Kuroshiro is still unavailable afterwards.
 */
export async function convertEnglishToKatakana(text: string): Promise<string> {
  if (!kuroshiroInstance) {
    await initializeTextConverter();
  }

  if (!kuroshiroInstance) {
    throw new Error("Failed to initialize kuroshiro");
  }

  try {
    // Convert the entire text to katakana in one call.
    // NOTE(review): Kuroshiro is a Japanese-text converter; whether it also
    // transliterates Latin-script words should be confirmed against its docs.
    const convertedText = await kuroshiroInstance.convert(text, {
      to: "katakana",
      mode: "normal",
    });

    return convertedText;
  } catch (error) {
    console.error("テキスト変換エラー:", error);
    // Return original text if conversion fails
    return text;
  }
}
|
|
|
|
/**
 * Convert only English words to katakana while leaving all other text as is.
 *
 * Every run of ASCII letters delimited by word boundaries is treated as one
 * English word and converted with `convertEnglishWordToKatakana`. Words are
 * matched case-sensitively, so "Hello" and "hello" are converted separately.
 *
 * The text converter is initialized on demand; Kuroshiro itself is only used
 * by the fallback path.
 *
 * @param text - Input text
 * @returns Text with only English words converted to katakana
 * @throws If initialization fails, or Kuroshiro is still unavailable afterwards.
 */
export async function convertEnglishWordsOnly(text: string): Promise<string> {
  if (!kuroshiroInstance) {
    await initializeTextConverter();
  }

  if (!kuroshiroInstance) {
    throw new Error("Failed to initialize kuroshiro");
  }

  try {
    const englishWordPattern = /\b[a-zA-Z]+\b/g;

    const matches = text.match(englishWordPattern);
    if (!matches) {
      return text;
    }

    // Convert each distinct word exactly once, even if it occurs many times.
    const conversions = new Map<string, string>();
    for (const englishWord of new Set(matches)) {
      try {
        conversions.set(
          englishWord,
          await convertEnglishWordToKatakana(englishWord),
        );
      } catch (convertError) {
        console.warn(
          `Failed to convert word "${englishWord}":`,
          convertError,
        );
        // No entry is stored, so the original word is kept below.
      }
    }

    // Substitute in a single pass over the original text. A replacer function
    // is used so the converted string is inserted literally (no `$&`-style
    // pattern expansion) and already-substituted text is never matched again.
    return text.replace(
      englishWordPattern,
      (word) => conversions.get(word) ?? word,
    );
  } catch (error) {
    console.error("選択的テキスト変換エラー:", error);
    // Fallback to full conversion
    return convertEnglishToKatakana(text);
  }
}
|
|
|
|
/**
 * Check whether the text converter is fully initialized.
 *
 * @returns `true` only when the Kuroshiro instance exists and JMdict reports
 *   itself as initialized.
 */
export function isTextConverterInitialized(): boolean {
  return kuroshiroInstance !== null && isJMdictInitialized();
}
|
|
|
|
/**
 * Check whether Kuroshiro alone is initialized, regardless of JMdict
 * (kept for backward compatibility).
 *
 * @returns `true` when the shared Kuroshiro instance has been set.
 */
export function isKuroshiroInitialized(): boolean {
  return kuroshiroInstance !== null;
}
|
|
|
|
/**
 * Get text converter status information.
 *
 * @returns An object with:
 *   - `kuroshiro`: whether the shared Kuroshiro instance has been set,
 *   - `jmdict`: whether JMdict reports itself as initialized,
 *   - `jmdictInfo`: whatever `getJMdictInfo()` currently returns.
 */
export function getTextConverterInfo(): {
  kuroshiro: boolean;
  jmdict: boolean;
  jmdictInfo: { dictDate: string; version: string } | null;
} {
  return {
    kuroshiro: kuroshiroInstance !== null,
    jmdict: isJMdictInitialized(),
    jmdictInfo: getJMdictInfo(),
  };
}
|
|
|
|
/**
 * Convert the English words in a text to katakana using JMdict.
 *
 * JMdict is initialized on demand. Every run of ASCII letters delimited by
 * word boundaries is looked up once via `convertEnglishToKatakanaWithJMdict`;
 * a word whose lookup throws or yields no usable string is left unchanged.
 * If the conversion as a whole fails, the error is logged and the text is
 * handed to `convertEnglishWordsOnly` instead.
 *
 * @param text - Input text containing English words
 * @returns Text with English words converted to Katakana using JMdict
 * @throws If JMdict initialization fails (that step is not covered by the fallback).
 */
export async function convertEnglishToKatakanaWithJMdictFallback(
  text: string,
): Promise<string> {
  if (!isJMdictInitialized()) {
    await initializeJMdict();
  }

  try {
    const englishWordPattern = /\b[a-zA-Z]+\b/g;

    const matches = text.match(englishWordPattern);
    if (!matches) {
      return text;
    }

    // Look up each unique word once to avoid duplicate conversions.
    const conversions = new Map<string, string>();
    for (const englishWord of new Set(matches)) {
      try {
        const converted = await convertEnglishToKatakanaWithJMdict(englishWord);

        // Only a non-empty string is accepted; anything else would otherwise
        // be substituted into the text in place of the word.
        if (typeof converted === "string" && converted.length > 0) {
          conversions.set(englishWord, converted);
        }
      } catch (convertError) {
        console.warn(
          `Failed to convert word "${englishWord}":`,
          convertError,
        );
        // No entry is stored, so the original word is kept below.
      }
    }

    // Substitute in a single pass over the original text. A replacer function
    // is used so the converted string is inserted literally (no `$&`-style
    // pattern expansion) and already-substituted text is never matched again.
    return text.replace(
      englishWordPattern,
      (word) => conversions.get(word) ?? word,
    );
  } catch (error) {
    console.error("JMdict-based English to Katakana conversion error:", error);
    // Fallback to the original method if JMdict conversion fails
    return convertEnglishWordsOnly(text);
  }
}
|