Close #3
This commit is contained in:
342
services/tts.ts
342
services/tts.ts
@ -3,6 +3,94 @@ import path from "path";
|
|||||||
import ffmpegPath from "ffmpeg-static";
|
import ffmpegPath from "ffmpeg-static";
|
||||||
import { config } from "./config.js";
|
import { config } from "./config.js";
|
||||||
|
|
||||||
|
/**
 * Build a global pattern that cuts a string into consecutive segments, each
 * consisting of optional text followed by a run of delimiter characters
 * (or trailing text with no delimiter). Keeping the delimiters attached to the
 * text they terminate prevents a chunk from ever starting with punctuation.
 *
 * @param delimiters Characters to treat as delimiters, written as the inside
 *                   of a regular-expression character class.
 */
function buildSegmentPattern(delimiters: string): RegExp {
  return new RegExp(`[^${delimiters}]*[${delimiters}]+|[^${delimiters}]+`, "g");
}

// Sentence terminators: Japanese full stop, ASCII and full-width "!" / "?",
// and line breaks. Full-width forms are written as escapes so they survive
// tools that normalise characters.
const SENTENCE_SEGMENTS = buildSegmentPattern("。!?\\uFF01\\uFF1F\\n");

// Clause boundaries: Japanese/ASCII/full-width commas and common particles.
const CLAUSE_SEGMENTS = buildSegmentPattern("、,\\uFF0Cはがでをにと");

/**
 * Reject chunk sizes that would make the splitting loops misbehave
 * (zero or negative sizes never advance the forced-split cursor).
 *
 * @throws RangeError when maxLength is not a positive integer.
 */
function assertValidMaxLength(maxLength: number): void {
  if (!Number.isInteger(maxLength) || maxLength < 1) {
    throw new RangeError(`maxLength must be a positive integer, got ${maxLength}`);
  }
}

/**
 * Cut text into pieces of at most maxLength UTF-16 code units without ever
 * separating the two halves of a surrogate pair. When maxLength is 1 and the
 * next character is a surrogate pair, that piece is 2 code units long.
 */
function hardSplitByLength(text: string, maxLength: number): string[] {
  const isHighSurrogate = (code: number): boolean => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code: number): boolean => code >= 0xdc00 && code <= 0xdfff;

  const pieces: string[] = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + maxLength, text.length);
    const cutsPair =
      end < text.length &&
      isHighSurrogate(text.charCodeAt(end - 1)) &&
      isLowSurrogate(text.charCodeAt(end));
    if (cutsPair) {
      // Prefer a shorter piece; only overshoot when the piece would be empty.
      end = end - 1 > start ? end - 1 : end + 1;
    }
    pieces.push(text.slice(start, end));
    start = end;
  }
  return pieces;
}

/**
 * Split text into natural chunks for TTS processing.
 *
 * Sentences (terminated by 。 ! ? their full-width forms, or a line break) are
 * packed greedily into chunks of at most maxLength characters. A sentence that
 * is longer than maxLength on its own is delegated to splitLongSentence.
 * Every returned chunk is trimmed and non-empty.
 *
 * @param text      Text to split. Leading/trailing whitespace is ignored.
 * @param maxLength Maximum chunk length in UTF-16 code units (default 50).
 * @returns The chunks in reading order; an empty array when text contains
 *          nothing but whitespace.
 * @throws RangeError when maxLength is not a positive integer.
 */
function splitTextIntoChunks(text: string, maxLength: number = 50): string[] {
  assertValidMaxLength(maxLength);

  const trimmedText = text.trim();
  if (trimmedText.length === 0) {
    // Returning [] (not [""]) lets callers detect "nothing to synthesize".
    return [];
  }
  if (trimmedText.length <= maxLength) {
    return [trimmedText];
  }

  const chunks: string[] = [];
  let currentChunk = "";

  const flushCurrentChunk = (): void => {
    const ready = currentChunk.trim();
    if (ready) {
      chunks.push(ready);
    }
    currentChunk = "";
  };

  for (const sentence of trimmedText.match(SENTENCE_SEGMENTS) ?? []) {
    if (currentChunk.length + sentence.length <= maxLength) {
      currentChunk += sentence;
      continue;
    }

    flushCurrentChunk();

    if (sentence.length > maxLength) {
      // A single sentence that cannot fit is broken at clause boundaries.
      chunks.push(...splitLongSentence(sentence, maxLength));
    } else {
      currentChunk = sentence;
    }
  }

  flushCurrentChunk();
  return chunks;
}

/**
 * Split a long sentence at natural break points (commas and common Japanese
 * particles), keeping each break character attached to the text before it.
 * Any clause that still exceeds maxLength is cut by length as a last resort.
 *
 * @param sentence  Sentence to split.
 * @param maxLength Maximum chunk length in UTF-16 code units.
 * @returns Trimmed, non-empty chunks in reading order.
 * @throws RangeError when maxLength is not a positive integer.
 */
function splitLongSentence(sentence: string, maxLength: number): string[] {
  assertValidMaxLength(maxLength);

  const trimmedSentence = sentence.trim();
  if (trimmedSentence.length <= maxLength) {
    return trimmedSentence ? [trimmedSentence] : [];
  }

  const packed: string[] = [];
  let currentChunk = "";

  for (const clause of trimmedSentence.match(CLAUSE_SEGMENTS) ?? []) {
    if (currentChunk.length + clause.length <= maxLength) {
      currentChunk += clause;
      continue;
    }
    if (currentChunk.trim()) {
      packed.push(currentChunk.trim());
    }
    // May itself exceed maxLength; handled by the forced split below.
    currentChunk = clause;
  }
  if (currentChunk.trim()) {
    packed.push(currentChunk.trim());
  }

  return packed
    .flatMap((chunk) =>
      chunk.length > maxLength ? hardSplitByLength(chunk, maxLength) : [chunk],
    )
    .map((chunk) => chunk.trim())
    .filter((chunk) => chunk.length > 0);
}
|
||||||
|
|
||||||
interface VoiceStyle {
|
interface VoiceStyle {
|
||||||
styleId: number;
|
styleId: number;
|
||||||
}
|
}
|
||||||
@ -12,6 +100,125 @@ const defaultVoiceStyle: VoiceStyle = {
|
|||||||
styleId: config.voicevox.styleId,
|
styleId: config.voicevox.styleId,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Generate audio for a single text chunk.
 *
 * Sends the chunk to the VOICEVOX `audio_query` endpoint, posts the returned
 * query JSON to the `synthesis` endpoint, and writes the resulting bytes to
 * `<podcastAudioDir>/<itemId>_chunk_<chunkIndex>.wav`.
 *
 * @param chunkText  Text to synthesize.
 * @param chunkIndex Zero-based position of the chunk; used in the file name
 *                   and (one-based) in log and error messages.
 * @param itemId     Identifier of the item being processed; used as the file
 *                   name prefix.
 * @returns Absolute path of the WAV file that was written.
 * @throws Error when either VOICEVOX request returns a non-OK status; the
 *         fetch itself rejects when a request exceeds the timeout.
 */
async function generateAudioForChunk(
  chunkText: string,
  chunkIndex: number,
  itemId: string,
): Promise<string> {
  // Five minutes per request. Applied to both calls so that a stalled
  // audio_query cannot hang the pipeline indefinitely.
  const requestTimeoutMs = 300000;

  const encodedText = encodeURIComponent(chunkText);
  const queryUrl = `${config.voicevox.host}/audio_query?text=${encodedText}&speaker=${defaultVoiceStyle.styleId}`;
  const synthesisUrl = `${config.voicevox.host}/synthesis?speaker=${defaultVoiceStyle.styleId}`;

  console.log(`チャンク${chunkIndex + 1}の音声クエリ開始: ${itemId} (${chunkText.length}文字)`);

  const queryResponse = await fetch(queryUrl, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Accept: "application/json",
    },
    signal: AbortSignal.timeout(requestTimeoutMs),
  });

  if (!queryResponse.ok) {
    const errorText = await queryResponse.text();
    throw new Error(
      `VOICEVOX audio query failed for chunk ${chunkIndex + 1} (${queryResponse.status}): ${errorText}`,
    );
  }

  // The query payload is passed through to the synthesis endpoint unchanged.
  const audioQuery: unknown = await queryResponse.json();

  console.log(`チャンク${chunkIndex + 1}の音声合成開始: ${itemId}`);
  const audioResponse = await fetch(synthesisUrl, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify(audioQuery),
    signal: AbortSignal.timeout(requestTimeoutMs),
  });

  if (!audioResponse.ok) {
    const errorText = await audioResponse.text();
    throw new Error(
      `VOICEVOX synthesis failed for chunk ${chunkIndex + 1} (${audioResponse.status}): ${errorText}`,
    );
  }

  const audioBuffer = Buffer.from(await audioResponse.arrayBuffer());

  // Prepare the output directory. With `recursive: true` this is a no-op when
  // the directory already exists, so no separate existence check is needed.
  const outputDir = config.paths.podcastAudioDir;
  fs.mkdirSync(outputDir, { recursive: true });

  const chunkWavPath = path.resolve(outputDir, `${itemId}_chunk_${chunkIndex}.wav`);
  fs.writeFileSync(chunkWavPath, audioBuffer);

  console.log(`チャンク${chunkIndex + 1}のWAVファイル保存完了: ${chunkWavPath}`);

  return chunkWavPath;
}
|
||||||
|
|
||||||
|
/**
 * Concatenate multiple WAV files into a single MP3 file.
 *
 * Writes a temporary FFmpeg concat list into the podcast audio directory,
 * runs FFmpeg (bundled `ffmpeg-static` binary, falling back to `ffmpeg` on
 * PATH) with the concat demuxer and the libmp3lame encoder, and then removes
 * the list file and every input WAV file, whether or not FFmpeg succeeded.
 *
 * @param wavFiles      Paths of the WAV files, in playback order. They are
 *                      deleted when this function finishes.
 * @param outputMp3Path Destination MP3 path; overwritten if it exists.
 * @throws Error when wavFiles is empty or FFmpeg exits with a non-zero code.
 */
async function concatenateAudioFiles(
  wavFiles: string[],
  outputMp3Path: string,
): Promise<void> {
  if (wavFiles.length === 0) {
    // Fail fast with a clear message instead of an opaque FFmpeg error.
    throw new Error("No WAV files to concatenate");
  }

  const ffmpegCmd = ffmpegPath || "ffmpeg";

  // Create a temporary file list for FFmpeg concat. The random suffix keeps
  // calls that start within the same millisecond from sharing a list file.
  const tempDir = config.paths.podcastAudioDir;
  const uniqueSuffix = `${Date.now()}_${Math.random().toString(36).slice(2)}`;
  const listFilePath = path.resolve(tempDir, `concat_list_${uniqueSuffix}.txt`);

  try {
    // Write file list in FFmpeg concat format. Inside single quotes FFmpeg
    // treats everything literally, so an embedded quote must be written as
    // '\'' (close quote, escaped quote, reopen quote).
    const fileList = wavFiles
      .map((file) => `file '${path.resolve(file).replace(/'/g, "'\\''")}'`)
      .join("\n");
    fs.writeFileSync(listFilePath, fileList);

    console.log(`音声ファイル結合開始: ${wavFiles.length}個のファイルを結合 -> ${outputMp3Path}`);

    const result = Bun.spawnSync([
      ffmpegCmd,
      "-f", "concat",
      "-safe", "0",
      "-i", listFilePath,
      "-codec:a", "libmp3lame",
      "-qscale:a", "2",
      "-y", // Overwrite output file
      outputMp3Path,
    ]);

    if (result.exitCode !== 0) {
      const stderr = result.stderr
        ? new TextDecoder().decode(result.stderr)
        : "Unknown error";
      throw new Error(`FFmpeg concatenation failed: ${stderr}`);
    }

    console.log(`音声ファイル結合完了: ${outputMp3Path}`);
  } finally {
    // Clean up the list file and the individual WAV files. Each removal is
    // best-effort: a failure here must not hide an FFmpeg error thrown above
    // nor prevent the remaining files from being removed.
    for (const tempFile of [listFilePath, ...wavFiles]) {
      try {
        if (fs.existsSync(tempFile)) {
          fs.unlinkSync(tempFile);
        }
      } catch (cleanupError) {
        console.warn(`一時ファイルの削除に失敗: ${tempFile}`, cleanupError);
      }
    }
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate TTS without adding to retry queue on failure
|
* Generate TTS without adding to retry queue on failure
|
||||||
* Used for retry queue processing to avoid infinite loops
|
* Used for retry queue processing to avoid infinite loops
|
||||||
@ -29,95 +236,80 @@ export async function generateTTSWithoutQueue(
|
|||||||
throw new Error("Script text is required for TTS generation");
|
throw new Error("Script text is required for TTS generation");
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`TTS生成開始: ${itemId} (試行回数: ${retryCount + 1})`);
|
console.log(`TTS生成開始: ${itemId} (試行回数: ${retryCount + 1}, ${scriptText.length}文字)`);
|
||||||
const encodedText = encodeURIComponent(scriptText);
|
|
||||||
|
|
||||||
const queryUrl = `${config.voicevox.host}/audio_query?text=${encodedText}&speaker=${defaultVoiceStyle.styleId}`;
|
// Split text into chunks
|
||||||
const synthesisUrl = `${config.voicevox.host}/synthesis?speaker=${defaultVoiceStyle.styleId}`;
|
const chunks = splitTextIntoChunks(scriptText.trim());
|
||||||
|
console.log(`テキストを${chunks.length}個のチャンクに分割: ${itemId}`);
|
||||||
const queryResponse = await fetch(queryUrl, {
|
|
||||||
method: "POST",
|
if (chunks.length === 0) {
|
||||||
headers: {
|
throw new Error("No valid text chunks generated");
|
||||||
"Content-Type": "application/json",
|
|
||||||
Accept: "application/json",
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!queryResponse.ok) {
|
|
||||||
const errorText = await queryResponse.text();
|
|
||||||
throw new Error(
|
|
||||||
`VOICEVOX audio query failed (${queryResponse.status}): ${errorText}`,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const audioQuery = await queryResponse.json();
|
|
||||||
|
|
||||||
console.log(`音声合成開始: ${itemId}`);
|
|
||||||
const audioResponse = await fetch(synthesisUrl, {
|
|
||||||
method: "POST",
|
|
||||||
headers: {
|
|
||||||
"Content-Type": "application/json",
|
|
||||||
},
|
|
||||||
body: JSON.stringify(audioQuery),
|
|
||||||
signal: AbortSignal.timeout(600000), // 10分のタイムアウト
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!audioResponse.ok) {
|
|
||||||
const errorText = await audioResponse.text();
|
|
||||||
console.error(`音声合成失敗: ${itemId}`);
|
|
||||||
throw new Error(
|
|
||||||
`VOICEVOX synthesis failed (${audioResponse.status}): ${errorText}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const audioArrayBuffer = await audioResponse.arrayBuffer();
|
|
||||||
const audioBuffer = Buffer.from(audioArrayBuffer);
|
|
||||||
|
|
||||||
// 出力ディレクトリの準備
|
|
||||||
const outputDir = config.paths.podcastAudioDir;
|
const outputDir = config.paths.podcastAudioDir;
|
||||||
if (!fs.existsSync(outputDir)) {
|
if (!fs.existsSync(outputDir)) {
|
||||||
fs.mkdirSync(outputDir, { recursive: true });
|
fs.mkdirSync(outputDir, { recursive: true });
|
||||||
}
|
}
|
||||||
|
|
||||||
const wavFilePath = path.resolve(outputDir, `${itemId}.wav`);
|
|
||||||
const mp3FilePath = path.resolve(outputDir, `${itemId}.mp3`);
|
const mp3FilePath = path.resolve(outputDir, `${itemId}.mp3`);
|
||||||
|
const generatedWavFiles: string[] = [];
|
||||||
|
|
||||||
console.log(`WAVファイル保存開始: ${wavFilePath}`);
|
try {
|
||||||
fs.writeFileSync(wavFilePath, audioBuffer);
|
// Generate audio for each chunk
|
||||||
console.log(`WAVファイル保存完了: ${wavFilePath}`);
|
for (let i = 0; i < chunks.length; i++) {
|
||||||
|
const chunk = chunks[i];
|
||||||
|
if (!chunk) continue;
|
||||||
|
console.log(`チャンク${i + 1}/${chunks.length}処理中: "${chunk.substring(0, 30)}${chunk.length > 30 ? '...' : ''}"`);
|
||||||
|
|
||||||
|
const wavPath = await generateAudioForChunk(chunk, i, itemId);
|
||||||
|
generatedWavFiles.push(wavPath);
|
||||||
|
}
|
||||||
|
|
||||||
console.log(`MP3変換開始: ${wavFilePath} -> ${mp3FilePath}`);
|
// Concatenate all audio files
|
||||||
|
if (generatedWavFiles.length === 1) {
|
||||||
|
// Single chunk - just convert to MP3
|
||||||
|
const ffmpegCmd = ffmpegPath || "ffmpeg";
|
||||||
|
const firstWavFile = generatedWavFiles[0];
|
||||||
|
if (!firstWavFile) {
|
||||||
|
throw new Error("No WAV files generated");
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = Bun.spawnSync([
|
||||||
|
ffmpegCmd,
|
||||||
|
"-i", firstWavFile,
|
||||||
|
"-codec:a", "libmp3lame",
|
||||||
|
"-qscale:a", "2",
|
||||||
|
"-y",
|
||||||
|
mp3FilePath,
|
||||||
|
]);
|
||||||
|
|
||||||
const ffmpegCmd = ffmpegPath || "ffmpeg";
|
if (result.exitCode !== 0) {
|
||||||
const result = Bun.spawnSync({
|
const stderr = result.stderr
|
||||||
cmd: [
|
? new TextDecoder().decode(result.stderr)
|
||||||
ffmpegCmd,
|
: "Unknown error";
|
||||||
"-i",
|
throw new Error(`FFmpeg conversion failed: ${stderr}`);
|
||||||
wavFilePath,
|
}
|
||||||
"-codec:a",
|
|
||||||
"libmp3lame",
|
// Clean up WAV file
|
||||||
"-qscale:a",
|
fs.unlinkSync(firstWavFile);
|
||||||
"2",
|
} else {
|
||||||
"-y", // Overwrite output file
|
// Multiple chunks - concatenate them
|
||||||
mp3FilePath,
|
await concatenateAudioFiles(generatedWavFiles, mp3FilePath);
|
||||||
],
|
}
|
||||||
});
|
|
||||||
|
|
||||||
if (result.exitCode !== 0) {
|
console.log(`TTS生成完了: ${itemId} (${chunks.length}チャンク)`);
|
||||||
const stderr = result.stderr
|
return path.basename(mp3FilePath);
|
||||||
? new TextDecoder().decode(result.stderr)
|
|
||||||
: "Unknown error";
|
} catch (error) {
|
||||||
throw new Error(`FFmpeg conversion failed: ${stderr}`);
|
// Clean up any generated files on error
|
||||||
|
for (const wavFile of generatedWavFiles) {
|
||||||
|
if (fs.existsSync(wavFile)) {
|
||||||
|
fs.unlinkSync(wavFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
throw error;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wavファイルを削除
|
|
||||||
if (fs.existsSync(wavFilePath)) {
|
|
||||||
fs.unlinkSync(wavFilePath);
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log(`TTS生成完了: ${itemId}`);
|
|
||||||
|
|
||||||
return path.basename(mp3FilePath);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function generateTTS(
|
export async function generateTTS(
|
||||||
|
Reference in New Issue
Block a user