Add JMDict Japanese dictionary support
This commit is contained in:
@ -1,58 +1,76 @@
|
||||
import Kuroshiro from "kuroshiro";
|
||||
import KuroshiroAnalyzerMecab from "kuroshiro-analyzer-mecab";
|
||||
import { toKatakana } from "wanakana";
|
||||
import {
|
||||
convertEnglishToKatakanaWithJMdict,
|
||||
getJMdictInfo,
|
||||
initializeJMdict,
|
||||
isJMdictInitialized,
|
||||
} from "./jmdict.js";
|
||||
|
||||
// Global instance to avoid recreating the analyzer
|
||||
let kuroshiroInstance: Kuroshiro | null = null;
|
||||
|
||||
/**
 * Hand-curated English → Katakana readings for common words.
 *
 * Keys are lowercase English words. The table is built on a prototype-less
 * object so that indexing it with an arbitrary word (for example
 * "constructor" or "toString") yields `undefined` rather than a member
 * inherited from `Object.prototype`.
 */
const englishToKatakanaMap: Record<string, string> = Object.assign(
  Object.create(null) as Record<string, string>,
  {
    hello: "ハロー",
    world: "ワールド",
    this: "ディス",
    is: "イズ",
    a: "ア",
    test: "テスト",
    javascript: "ジャバスクリプト",
    typescript: "タイプスクリプト",
    and: "アンド",
    api: "エーピーアイ",
    endpoint: "エンドポイント",
    machine: "マシン",
    learning: "ラーニング",
    model: "モデル",
    analysis: "アナリシス",
    computer: "コンピューター",
    data: "データ",
    software: "ソフトウェア",
    program: "プログラム",
    system: "システム",
    network: "ネットワーク",
    server: "サーバー",
    client: "クライアント",
    database: "データベース",
    file: "ファイル",
    user: "ユーザー",
    password: "パスワード",
    login: "ログイン",
    logout: "ログアウト",
    website: "ウェブサイト",
    browser: "ブラウザー",
    application: "アプリケーション",
    service: "サービス",
  },
);
|
||||
|
||||
/**
|
||||
* Convert English word to Katakana using predefined mapping or phonetic approximation
|
||||
* Convert English word to Katakana using JMdict, predefined mapping, or phonetic approximation
|
||||
*/
|
||||
function convertEnglishWordToKatakana(word: string): string {
|
||||
async function convertEnglishWordToKatakana(word: string): Promise<string> {
|
||||
const lowerWord = word.toLowerCase();
|
||||
|
||||
// Check predefined mapping first
|
||||
|
||||
// First try JMdict if available
|
||||
try {
|
||||
if (isJMdictInitialized()) {
|
||||
const jmdictResult = await convertEnglishToKatakanaWithJMdict(word);
|
||||
if (jmdictResult && jmdictResult !== word) {
|
||||
return jmdictResult;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(`JMdict conversion failed for "${word}":`, error);
|
||||
}
|
||||
|
||||
// Check predefined mapping second
|
||||
if (englishToKatakanaMap[lowerWord]) {
|
||||
return englishToKatakanaMap[lowerWord];
|
||||
}
|
||||
|
||||
|
||||
// Try using wanakana for romanized pronunciation
|
||||
try {
|
||||
// Convert to a rough romanized version and then to katakana
|
||||
@ -63,7 +81,7 @@ function convertEnglishWordToKatakana(word: string): string {
|
||||
} catch {
|
||||
// Fallback if wanakana fails
|
||||
}
|
||||
|
||||
|
||||
// Fallback: simple phonetic approximation
|
||||
return approximateEnglishToKatakana(word);
|
||||
}
|
||||
@ -73,35 +91,78 @@ function convertEnglishWordToKatakana(word: string): string {
|
||||
*/
|
||||
function approximateEnglishToKatakana(word: string): string {
  // Character-by-character transliteration: each ASCII letter is swapped for
  // a fixed katakana stand-in. No phonetic analysis is performed, so the
  // result is only a rough approximation of the pronunciation.
  const phoneticMap: Record<string, string> = {
    a: "ア",
    b: "ブ",
    c: "ク",
    d: "ド",
    e: "エ",
    f: "フ",
    g: "グ",
    h: "ハ",
    i: "イ",
    j: "ジ",
    k: "ク",
    l: "ル",
    m: "ム",
    n: "ン",
    o: "オ",
    p: "プ",
    q: "ク",
    r: "ル",
    s: "ス",
    t: "ト",
    u: "ウ",
    v: "ブ",
    w: "ワ",
    x: "クス",
    y: "ワイ",
    z: "ズ",
  };

  // The input is lowercased first so the table only needs lowercase keys.
  // Characters without an entry (digits, punctuation, non-Latin text) are
  // emitted unchanged, in their lowercased form.
  return word
    .toLowerCase()
    .split("")
    .map((char) => phoneticMap[char] ?? char)
    .join("");
}
|
||||
|
||||
/**
 * Initialize the text converter: Kuroshiro (with the MeCab analyzer) and JMdict.
 *
 * This should be called once during application startup. A component that is
 * already initialized is skipped, so calling again after a failure only
 * retries whatever is still missing.
 *
 * @throws Re-throws the error raised by Kuroshiro or JMdict initialization
 *         after logging it.
 */
export async function initializeTextConverter(): Promise<void> {
  if (kuroshiroInstance && isJMdictInitialized()) {
    return; // Already initialized
  }

  try {
    console.log("テキストコンバーターを初期化中...");

    // Initialize Kuroshiro if not already done
    if (!kuroshiroInstance) {
      console.log("Kuroshiroを初期化中...");
      // Store the instance only after init() has resolved. If init() rejects,
      // the module-level reference stays null, so the readiness checks keep
      // reporting "not initialized" and a later call retries the setup.
      const instance = new Kuroshiro();
      await instance.init(new KuroshiroAnalyzerMecab());
      kuroshiroInstance = instance;
      console.log("Kuroshiro初期化完了");
    }

    // Initialize JMdict if not already done
    if (!isJMdictInitialized()) {
      console.log("JMdictを初期化中...");
      await initializeJMdict();
      console.log("JMdict初期化完了");

      const jmdictInfo = getJMdictInfo();
      if (jmdictInfo) {
        console.log(
          `JMdict情報: バージョン ${jmdictInfo.version}, 辞書日付 ${jmdictInfo.dictDate}`,
        );
      }
    }

    console.log("テキストコンバーター初期化完了");
  } catch (error) {
    console.error("テキストコンバーターの初期化に失敗しました:", error);
    throw error;
  }
}
|
||||
@ -155,22 +216,28 @@ export async function convertEnglishWordsOnly(text: string): Promise<string> {
|
||||
// Extract English words using regex
|
||||
const englishWordPattern = /\b[a-zA-Z]+\b/g;
|
||||
let result = text;
|
||||
|
||||
|
||||
// Find all English words
|
||||
const matches = text.match(englishWordPattern);
|
||||
|
||||
|
||||
if (matches) {
|
||||
for (const englishWord of matches) {
|
||||
try {
|
||||
// Convert each English word to katakana using our custom function
|
||||
const converted = convertEnglishWordToKatakana(englishWord);
|
||||
|
||||
const converted = await convertEnglishWordToKatakana(englishWord);
|
||||
|
||||
// Replace the English word with its katakana equivalent
|
||||
// Use word boundary to avoid partial replacements
|
||||
const wordRegex = new RegExp(`\\b${englishWord.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\b`, 'g');
|
||||
const wordRegex = new RegExp(
|
||||
`\\b${englishWord.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b`,
|
||||
"g",
|
||||
);
|
||||
result = result.replace(wordRegex, converted);
|
||||
} catch (convertError) {
|
||||
console.warn(`Failed to convert word "${englishWord}":`, convertError);
|
||||
console.warn(
|
||||
`Failed to convert word "${englishWord}":`,
|
||||
convertError,
|
||||
);
|
||||
// Keep original word if conversion fails
|
||||
}
|
||||
}
|
||||
@ -185,8 +252,85 @@ export async function convertEnglishWordsOnly(text: string): Promise<string> {
|
||||
}
|
||||
|
||||
/**
 * Report whether the text converter is fully ready: a Kuroshiro instance is
 * held and JMdict reports itself as initialized.
 */
export function isTextConverterInitialized(): boolean {
  if (kuroshiroInstance === null) {
    return false;
  }
  return isJMdictInitialized();
}
|
||||
|
||||
/**
 * Check whether a Kuroshiro instance is held, regardless of JMdict state.
 * Kept for backward compatibility with callers that only need Kuroshiro.
 */
export function isKuroshiroInitialized(): boolean {
  return kuroshiroInstance !== null;
}
|
||||
|
||||
/**
 * Collect a snapshot of the text converter's status.
 *
 * @returns An object with `kuroshiro` (whether a Kuroshiro instance is held),
 *          `jmdict` (the result of `isJMdictInitialized()`), and `jmdictInfo`
 *          (the value returned by `getJMdictInfo()`).
 */
export function getTextConverterInfo(): {
  kuroshiro: boolean;
  jmdict: boolean;
  jmdictInfo: { dictDate: string; version: string } | null;
} {
  const kuroshiro = kuroshiroInstance !== null;
  const jmdict = isJMdictInitialized();
  const jmdictInfo = getJMdictInfo();

  return { kuroshiro, jmdict, jmdictInfo };
}
|
||||
|
||||
/**
 * Convert the English words in a text to Katakana using JMdict.
 *
 * JMdict is initialized on demand when it is not ready yet. Each distinct
 * word (a run of ASCII letters delimited by word boundaries) is converted
 * once through `convertEnglishToKatakanaWithJMdict`; a word whose conversion
 * throws is logged and left unchanged. If the conversion pass itself fails,
 * the text is handed to `convertEnglishWordsOnly` instead.
 *
 * @param text - Input text containing English words
 * @returns Text with English words replaced by their Katakana conversions
 * @throws If on-demand JMdict initialization fails; that call runs before
 *         the fallback guard, so its error reaches the caller.
 */
export async function convertEnglishToKatakanaWithJMdictFallback(
  text: string,
): Promise<string> {
  if (!isJMdictInitialized()) {
    await initializeJMdict();
  }

  try {
    // Extract English words using regex
    const englishWordPattern = /\b[a-zA-Z]+\b/g;
    const matches = text.match(englishWordPattern);

    if (!matches) {
      return text;
    }

    // Convert each unique word once; failed words simply get no entry.
    const conversions = new Map<string, string>();
    for (const englishWord of new Set(matches)) {
      try {
        conversions.set(
          englishWord,
          await convertEnglishToKatakanaWithJMdict(englishWord),
        );
      } catch (convertError) {
        console.warn(
          `Failed to convert word "${englishWord}":`,
          convertError,
        );
        // Keep original word if conversion fails
      }
    }

    // Substitute in a single pass over the original text. A replacer
    // function inserts the converted text literally (no `$&`-style pattern
    // expansion) and never re-scans output from an earlier substitution.
    return text.replace(
      englishWordPattern,
      (englishWord) => conversions.get(englishWord) ?? englishWord,
    );
  } catch (error) {
    console.error("JMdict-based English to Katakana conversion error:", error);
    // Fallback to the original method if JMdict conversion fails
    return convertEnglishWordsOnly(text);
  }
}
|
||||
|
Reference in New Issue
Block a user