diff --git a/.gitignore b/.gitignore index 074fc6f..e6a6396 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ # Build output -dist +dist/ # General files node_modules @@ -28,4 +28,7 @@ package-lock.json example_audio.webm example_audio_pitched.webm -msedgetts-test/ \ No newline at end of file +# Generated test files and AI-generated content +msedgetts-test/ +.sisyphus/ +.github/ \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..30d1939 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,184 @@ +# PROJECT KNOWLEDGE BASE + +**Generated:** 2026-03-22 +**Commit:** main branch +**Branch:** main + +## OVERVIEW + +Microsoft Edge TTS Text-to-Speech Library - A Node.js/TypeScript module using Azure Speech Service (Microsoft Edge Read Aloud API). Supports speech synthesis, SSML, multi-speaker dialogue, emotional style control, and multiple audio format output. + +**Core Stack**: TypeScript, WebSocket, Jest (testing), pnpm (package manager) +**Code Size**: ~1010 lines of TypeScript (src/ directory) +**Last Updated**: 2026-03-22 + +## STRUCTURE + +``` +./ +├── src/ # All source code (9 TypeScript files) +│ ├── index.ts # Main entry point (barrel exports, 6 exports) +│ ├── MsEdgeTTS.ts # Core TTS class (~499 lines, WebSocket communication) +│ ├── MsEdgeTTS.spec.ts # Unit tests +│ ├── Output.ts # Audio output format enum + extension mapping +│ ├── Prosody.ts # Rate/pitch/volume options class +│ ├── DialogueTurn.ts # Dialogue turn type definition +│ ├── DialogueBuilder.ts # Dialogue builder class + SSML builder function +│ ├── SSMLUtils.ts # SSML utility functions (escape, validate) +│ └── utils.ts # Path joining utility +├── example/ # Example demo code (6 Chinese-named files) +│ ├── 00-简单对话演示.ts +│ ├── 01-多说话人对话 - 链式调用.ts +│ ├── 02-多说话人对话 - 函数式.ts +│ ├── 03-31 种情感风格演示.ts +│ ├── 04-情感强度控制演示.ts +│ └── 05-文本替换功能演示.ts +├── .github/workflows/ +│ └── deploy_docs.yml # CI/CD: Documentation deployment to gh-pages only +├── docs/ # Manually 
written SSML documentation +├── package.json # Dependencies + Jest config (inline) +├── tsconfig.json # TypeScript compilation configuration +└── README.md # API documentation +``` + +## WHERE TO LOOK + +| Task | Location | Description | +|------|------|------| +| Add new feature | `src/` | Create `.ts` file at same level | +| Modify core logic | `src/MsEdgeTTS.ts` | WebSocket communication, SSML processing | +| Add audio format | `src/Output.ts` | `OUTPUT_FORMAT` enum | +| Modify voice options | `src/Prosody.ts` | `ProsodyOptions` class | +| Add tests | `src/*.spec.ts` | Tests in same directory as source | +| Modify CI/CD | `.github/workflows/` | Documentation deployment flow only | +| Configure Jest | `package.json` | Jest config inline in package.json | + +## CODE MAP + +| Symbol | Type | Location | Role | +|--------|------|----------|------| +| `MsEdgeTTS` | Class | `src/MsEdgeTTS.ts` | Main class: WebSocket connection, speech synthesis | +| `OUTPUT_FORMAT` | Enum | `src/Output.ts` | Supported audio output formats (MP3, WEBM) | +| `OUTPUT_EXTENSIONS` | Const | `src/Output.ts` | Format to file extension mapping | +| `ProsodyOptions` | Class | `src/Prosody.ts` | Rate/pitch/volume configuration options | +| `RATE` | Enum | `src/Prosody.ts` | Speaking rate presets (x-slow to x-fast) | +| `PITCH` | Enum | `src/Prosody.ts` | Pitch presets (x-low to x-high) | +| `VOLUME` | Enum | `src/Prosody.ts` | Volume presets (silent to x-LOUD) | +| `Voice` | Type | `src/MsEdgeTTS.ts` | Voice metadata structure | +| `MetadataOptions` | Class | `src/MsEdgeTTS.ts` | Boundary metadata options (sentence/word) | +| `DialogueBuilder` | Class | `src/DialogueBuilder.ts` | Chained dialogue builder | +| `buildDialogueSSML` | Function | `src/DialogueBuilder.ts` | Functional SSML generation | +| `escapeSSML` | Function | `src/SSMLUtils.ts` | XML escape (& < > " ') | +| `validateStyle` | Function | `src/SSMLUtils.ts` | Validate 28 official emotional styles | +| `validateStyleDegree` | Function 
| `src/SSMLUtils.ts` | Validate styleDegree range (0.01-2.0) | +| `joinPath` | Function | `src/utils.ts` | Path joining utility | + +## CONVENTIONS + +**TypeScript Configuration**: +- `target`: ESNext +- `module`: CommonJS +- `outDir`: dist/ +- Skip library check (skipLibCheck: true) + +**Testing Conventions**: +- Test files: `*.spec.ts` in same directory as source +- Jest config inline in `package.json` +- Test timeout: 15 seconds + +**Package Manager**: +- pnpm required (preinstall hook) +- Version lock: pnpm-lock.yaml + +**Error Handling Conventions**: +- Throw clear Error on validation failure (see SSMLUtils.ts) +- Invalid input throws immediately, no fallback + +**Logging Conventions**: +- Optional logger via `enableLogger` option +- Private `_log()` method for logging +- Log only connection status, message exchange + +**SSML Processing Conventions**: +- Escape & first, then others, to prevent double escaping +- Only `speak`, `voice`, `prosody` elements supported + +## ANTI-PATTERNS (THIS PROJECT) + +- ❌ **Do NOT** use npm/yarn - project requires pnpm +- ❌ **Do NOT** move tests to separate directory - keep `*.spec.ts` alongside source +- ❌ **Do NOT** modify tsconfig module/moduleResolution - depends on CommonJS +- ❌ **Do NOT** modify Sec-MS-GEC hash algorithm - depends on Azure authentication +- ❌ **Do NOT** remove `isomorphic-ws` dependency - enables cross-environment compatibility +- ❌ **Do NOT** use callback API - Promise only +- ❌ **Do NOT** use in browser - API requires Edge User-Agent (server-side only) +- ❌ **Do NOT** delete files outside `dist/` - publish includes only dist directory + +## ERROR HANDLING + +**Error Throwing Scenarios**: +- Metadata not configured: `"Speech synthesis not configured yet..."` +- Invalid voiceLocale: `"Could not infer voiceLocale from voiceName..."` +- Invalid style: `'Invalid style "xxx". 
Valid styles: ...'` +- styleDegree out of range: `"styleDegree must be between 0.01 and 2.0"` +- Empty voice name: `"voice name is required and cannot be empty"` +- Empty text: `"text cannot be empty string"` + +## UNIQUE STYLES + +**SSML Template**: +- Default template: `<speak>` → `<voice>` → `<prosody>` +- Only `speak`, `voice`, `prosody` elements supported +- Full SSML not supported + +**WebSocket Communication**: +- Uses `isomorphic-ws` for browser/Node compatibility +- Custom UUID generation (not crypto.randomUUID) +- Sec-MS-GEC hash authentication mechanism + +**Logging System**: +- Optional logger (enableLogger option) +- Logs only connection status, message exchange + +## COMMANDS + +```bash +# Install dependencies +pnpm install + +# Development (build + run tests) +pnpm run dev + +# Compile TypeScript +pnpm run build + +# Run tests +pnpm test + +# Tests (watch mode) +pnpm run test:watch + +# Tests (coverage) +pnpm run test:cov + +# Publish to npm +pnpm run publish +``` + +## NOTES + +**Key Limitations**: +- December 2025 update: API requires Edge User-Agent, **cannot be used in browsers** +- Promise API only, no callback support +- Voice list requires trusted client Token (hardcoded in source) + +**Known Issues**: +- `src/test/test.ts` and `src/test/jest-e2e.json` in package.json do not exist (legacy config) +- Insufficient test coverage: only 1 test file (MsEdgeTTS.spec.ts), 11% coverage +- utils.ts is too simplified (only 6 lines), could be merged +- example/ directory mixes non-TS files (config.json, run.sh, etc.) + +**Publish Flow**: +1. `pnpm run build` compiles to dist/ +2. `pnpm publish --access=public` +3. Documentation auto-deploys to gh-pages (via GitHub Actions) diff --git a/README.md b/README.md index b55c3be..c6c416d 100644 --- a/README.md +++ b/README.md @@ -134,3 +134,870 @@ import {MsEdgeTTS, OUTPUT_FORMAT} from "msedge-tts"; For the full documentation check out the [API Documentation](https://migushthe2nd.github.io/MsEdgeTTS). 
This library only supports promises. + +## Multi-Speaker Dialogue + +Supports multi-speaker dialogue synthesis, making it easy to create audio content containing multiple voice characters. + +### Simple Example (Functional) + +Quickly build dialogue using the `buildDialogueSSML()` utility function: + +```js +import {MsEdgeTTS, OUTPUT_FORMAT, buildDialogueSSML} from "msedge-tts"; + +(async () => { + const tts = new MsEdgeTTS(); + await tts.setMetadata("zh-CN-XiaoxiaoNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS); + + const ssml = buildDialogueSSML([ + { voice: "zh-CN-XiaoxiaoNeural", text: "Hello", style: "cheerful" }, + { voice: "en-US-AndrewNeural", text: "Hello", lang: "en-US" } + ]); + + const {audioStream} = await tts.toStream(ssml); + + audioStream.on("data", (data) => { + console.log("DATA RECEIVED", data); + }); +})(); +``` + +### Chained Call Example + +Build dialogue in a chained manner using the `DialogueBuilder` class: + +```js +import {MsEdgeTTS, OUTPUT_FORMAT, DialogueBuilder} from "msedge-tts"; + +(async () => { + const tts = new MsEdgeTTS(); + await tts.setMetadata("zh-CN-XiaoxiaoNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS); + + const dialogue = new DialogueBuilder() + .addTurn({ voice: "zh-CN-XiaoxiaoNeural", text: "Hello everyone!" }) + .addTurn({ voice: "en-US-AndrewNeural", text: "Hi everyone!" 
}) + .build(); + + const {audioStream} = await tts.toStreamDialogue(dialogue); + + audioStream.on("data", (data) => { + console.log("DATA RECEIVED", data); + }); +})(); +``` + +### Chinese-English Mixed Example + +Supports mixing multiple languages within the same dialogue: + +```js +import {MsEdgeTTS, OUTPUT_FORMAT, buildDialogueSSML} from "msedge-tts"; + +(async () => { + const tts = new MsEdgeTTS(); + await tts.setMetadata("zh-CN-XiaoxiaoNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS); + + const ssml = buildDialogueSSML([ + { + voice: "zh-CN-XiaoxiaoNeural", + text: "Welcome to our meeting", + style: "friendly" + }, + { + voice: "en-US-AndrewNeural", + text: "Welcome to our conference", + style: "friendly", + lang: "en-US" + }, + { + voice: "zh-CN-YunxiNeural", + text: "Today we will discuss the future of artificial intelligence", + style: "documentary-narration" + } + ]); + + const {audioStream} = await tts.toStream(ssml); + + audioStream.on("data", (data) => { + console.log("DATA RECEIVED", data); + }); +})(); +``` + +### Supported Emotional Styles + +Microsoft Azure Speech Service officially supports the following 28 emotional styles: + +| Style | Description | +| --- | --- | +| `advertisement_upbeat` | Promote products or services with an excited and energetic tone | +| `affectionate` | Express warm and affectionate tone with higher pitch and volume | +| `angry` | Express angry and disgusted tone | +| `assistant` | Speak in a warm and relaxed tone, used for digital assistants | +| `calm` | Speak with composure and calmness | +| `chat` | Express a relaxed and casual tone | +| `cheerful` | Express a positive and pleasant tone | +| `customerservice` | Provide support to customers with a friendly and enthusiastic tone | +| `depressed` | Express melancholy and depressed tone with lower pitch and volume | +| `documentary-narration` | Narrate documentaries in a relaxed, interested, and informative style | +| `empathetic` | Express care and understanding | +| 
`excited` | Express an optimistic and hopeful tone | +| `fearful` | Express fear with higher pitch, higher volume, and faster speech rate | +| `friendly` | Express a pleasant, charming, and warm tone | +| `gentle` | Express a mild, polite, and pleasant tone with lower pitch and volume | +| `hopeful` | Speak in a warm and longing tone | +| `lyrical` | Express emotions in a graceful and slightly sentimental way | +| `narration-professional` | Read content in a professional and objective tone | +| `narration-relaxed` | Speak in a soothing and pleasant tone, used for content narration | +| `newscast` | Narrate news in a formal and professional tone | +| `newscast-casual` | Deliver general news in a common, casual tone | +| `newscast-formal` | Deliver news in a formal, confident, and authoritative tone | +| `poetry-reading` | Express emotional and rhythmic tone when reading poetry | +| `sad` | Express a sorrowful tone | +| `serious` | Express a serious and commanding tone | +| `shouting` | Sound as if speaking from a distance or in another location | +| `sports_commentary` | Express a relaxed yet interested tone for broadcasting sports events | +| `sports_commentary_excited` | Broadcast sports event highlights with a fast and energetic tone | +| `terrified` | Express a fearful tone with fast speech rate and trembling voice | +| `unfriendly` | Express a cold and indifferent tone | +| `whispering` | Speak in a soft tone trying to produce a gentle and mild sound | + +### Using Style Degree + +You can adjust the emotional intensity through the `styleDegree` parameter (range: 0.01 to 2.0, default is 1): + +```js +const ssml = buildDialogueSSML([ + { + voice: "zh-CN-XiaomoNeural", + text: "Hurry up, be careful on the road", + style: "sad", + styleDegree: 2.0 // Stronger sadness emotion + } +]); +``` + +For more detailed information, please refer to the [Microsoft official 
documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup-voice). + +--- + +## Complete API Reference + +### Class: `MsEdgeTTS` + +Main TTS class for speech synthesis via WebSocket. + +#### Constructor + +```ts +new MsEdgeTTS(options?: Options) +``` + +**Options:** +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `agent` | `Agent` | `undefined` | Custom HTTP agent (proxy support, **not supported in browser**) | +| `enableLogger` | `boolean` | `false` | Enable built-in logger for connection status | + +#### Methods + +##### `getVoices(): Promise<Voice[]>` + +Fetch the list of voices available in Microsoft Edge. + +**Returns:** Array of voice objects with properties: +- `Name`: Full voice name +- `ShortName`: Short identifier (e.g., `"en-US-AriaNeural"`) +- `Gender`: `"Male"` or `"Female"` +- `Locale`: Voice locale (e.g., `"en-US"`) +- `SuggestedCodec`: Recommended codec +- `FriendlyName`: Display name +- `Status`: Voice status + +**Example:** +```ts +const tts = new MsEdgeTTS(); +const voices = await tts.getVoices(); +console.log(voices.filter(v => v.Gender === "Female")); +``` + +--- + +##### `setMetadata(voiceName, outputFormat, metadataOptions?): Promise<void>` + +Initialize speech synthesis parameters. 
**Must be called before `toStream` or `toFile`.** + +**Parameters:** +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `voiceName` | `string` | ✅ | Voice ShortName (e.g., `"en-US-AriaNeural"`) | +| `outputFormat` | `OUTPUT_FORMAT` | ✅ | Audio output format | +| `metadataOptions` | `MetadataOptions` | ❌ | Boundary metadata options | + +**MetadataOptions:** +| Property | Type | Default | Description | +|----------|------|---------|-------------| +| `voiceLocale` | `string` | Auto-inferred | Voice locale override | +| `sentenceBoundaryEnabled` | `boolean` | `false` | Enable sentence boundary metadata | +| `wordBoundaryEnabled` | `boolean` | `false` | Enable word boundary metadata | + +**Example:** +```ts +await tts.setMetadata( + "en-US-AriaNeural", + OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS, + { wordBoundaryEnabled: true, sentenceBoundaryEnabled: true } +); +``` + +--- + +##### `toStream(input, options?): { audioStream: Readable, metadataStream: Readable | null }` + +Synthesize text to audio stream (real-time). + +**Parameters:** +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `input` | `string` | ✅ | Text or SSML to synthesize | +| `options` | `ProsodyOptions` | ❌ | Voice prosody settings | + +**Returns:** +- `audioStream`: Node.js Readable stream with raw audio data +- `metadataStream`: Readable stream with boundary metadata (if enabled) + +**Example:** +```ts +const { audioStream, metadataStream } = await tts.toStream("Hello world", { + rate: RATE.FAST, + pitch: "+10Hz", + volume: VOLUME.LOUD +}); + +audioStream.on("data", (data) => { + console.log("Audio chunk:", data); +}); + +metadataStream?.on("data", (chunk) => { + const metadata = JSON.parse(chunk.toString()); + console.log("Metadata:", metadata); +}); +``` + +--- + +##### `toFile(dirPath, input, options?): Promise<{ audioFilePath: string, metadataFilePath: string | null }>` + +Synthesize text and save to file. 
+ +**Parameters:** +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `dirPath` | `string` | ✅ | Output directory path | +| `input` | `string` | ✅ | Text or SSML to synthesize | +| `options` | `ProsodyOptions` | ❌ | Voice prosody settings | + +**Example:** +```ts +const { audioFilePath, metadataFilePath } = await tts.toFile( + "./output", + "Hello world", + { rate: 0.8 } +); +console.log("Saved to:", audioFilePath); +``` + +--- + +##### `toStreamDialogue(dialogue): { audioStream: Readable, metadataStream: Readable | null }` + +Synthesize multi-speaker dialogue to stream. + +**Parameters:** +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `dialogue` | `Dialogue | DialogueTurn[]` | ✅ | Dialogue object or array of turns | + +**Example:** +```ts +const dialogue = new DialogueBuilder() + .addTurn({ voice: "zh-CN-XiaoxiaoNeural", text: "你好", style: "cheerful" }) + .addTurn({ voice: "en-US-AndrewNeural", text: "Hello", lang: "en-US" }) + .build(); + +const { audioStream } = await tts.toStreamDialogue(dialogue); +``` + +--- + +##### `toFileDialogue(dirPath, dialogue, options?): Promise<{ audioFilePath: string, metadataFilePath: string | null }>` + +Synthesize multi-speaker dialogue and save to file. + +**Parameters:** +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `dirPath` | `string` | ✅ | Output directory path | +| `dialogue` | `Dialogue | DialogueTurn[]` | ✅ | Dialogue object or array of turns | +| `options` | `ProsodyOptions` | ❌ | Global prosody settings | + +--- + +##### `rawToStream(requestSSML): { audioStream: Readable, metadataStream: Readable | null }` + +Synthesize custom SSML to stream (no template applied). 
+ +**Parameters:** +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `requestSSML` | `string` | ✅ | Complete SSML string | + +**Example:** +```ts +const customSSML = ` + + + + Hello world + + +`; + +const { audioStream } = await tts.rawToStream(customSSML); +``` + +--- + +##### `rawToFile(dirPath, requestSSML): Promise<{ audioFilePath: string, metadataFilePath: string | null }>` + +Synthesize custom SSML and save to file. + +--- + +##### `close(): void` + +Close the WebSocket connection. + +--- + +### Enum: `OUTPUT_FORMAT` + +Supported audio output formats. + +| Format | Codec | Bitrate | Extension | Use Case | +|--------|-------|---------|-----------|----------| +| `AUDIO_24KHZ_48KBITRATE_MONO_MP3` | MP3 | 48 kbps | `.mp3` | Standard quality | +| `AUDIO_24KHZ_96KBITRATE_MONO_MP3` | MP3 | 96 kbps | `.mp3` | High quality | +| `WEBM_24KHZ_16BIT_MONO_OPUS` | OPUS | ~64 kbps | `.webm` | Web streaming | + +**Usage:** +```ts +import { OUTPUT_FORMAT } from "msedge-tts"; + +await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS); +``` + +--- + +### Class: `ProsodyOptions` + +Voice prosody configuration. + +```ts +class ProsodyOptions { + pitch?: PITCH | string = "+0Hz" + rate?: RATE | string | number = 1.0 + volume?: VOLUME | string | number = 100.0 +} +``` + +#### Properties + +##### `pitch` + +Baseline pitch for the voice. + +**Accepted values:** +- `PITCH` enum: `X_LOW`, `LOW`, `MEDIUM`, `HIGH`, `X_HIGH`, `DEFAULT` +- Relative frequency: `"+50Hz"`, `"-20Hz"` +- Relative semitones: `"+2st"`, `"-3st"` +- Relative percentage: `"+50%"`, `"-25%"` + +**Default:** `"+0Hz"` + +--- + +##### `rate` + +Speaking rate for the voice. 
+ +**Accepted values:** +- `RATE` enum: `X_SLOW`, `SLOW`, `MEDIUM`, `FAST`, `X_FAST`, `DEFAULT` +- Relative number: `0.5` (50%), `2.0` (200%) +- Relative percentage string: `"+50%"`, `"-25%"` + +**Default:** `1.0` (normal speed) + +--- + +##### `volume` + +Volume level for the voice. + +**Accepted values:** +- `VOLUME` enum: `SILENT`, `X_SOFT`, `SOFT`, `MEDIUM`, `LOUD`, `X_LOUD`, `DEFAULT` +- Absolute number: `0` to `100` +- Relative number: `"+10"`, `"-20"` +- Relative percentage: `"+50%"`, `"-30%"` + +**Default:** `100.0` + +--- + +### Enum: `RATE` + +Speaking rate presets. + +| Value | Description | +|-------|-------------| +| `X_SLOW` | Extra slow (0.3x) | +| `SLOW` | Slow (0.5x) | +| `MEDIUM` | Medium (0.8x) | +| `DEFAULT` | Normal (1.0x) | +| `FAST` | Fast (1.5x) | +| `X_FAST` | Extra fast (2.0x) | + +--- + +### Enum: `PITCH` + +Pitch presets. + +| Value | Description | +|-------|-------------| +| `X_LOW` | Extra low | +| `LOW` | Low | +| `MEDIUM` | Medium | +| `DEFAULT` | Normal | +| `HIGH` | High | +| `X_HIGH` | Extra high | + +--- + +### Enum: `VOLUME` + +Volume presets. + +| Value | Description | +|-------|-------------| +| `SILENT` | Silent | +| `X_SOFT` | Extra soft | +| `SOFT` | Soft | +| `MEDIUM` | Medium | +| `LOUD` | Loud | +| `X_LOUD` | Extra loud | + +--- + +### Interface: `DialogueTurn` + +Single speaker turn in a multi-speaker dialogue. + +```ts +interface DialogueTurn { + speaker?: string // Optional speaker name + voice: string // Voice ShortName (required) + text?: string // Text content + children?: TextSegment[] // Child text segments + style?: string // Emotional style + styleDegree?: number // Style intensity (0.01-2.0) + lang?: string // Language override (e.g., "en-US") + substitutions?: Substitution[] // Text replacements +} +``` + +--- + +### Interface: `TextSegment` + +Text segment with language or substitution. 
+ +```ts +interface TextSegment { + text: string + lang?: string // Language for this segment + substitution?: string // Custom SSML substitution +} +``` + +--- + +### Interface: `Substitution` + +Text substitution for pronunciation. + +```ts +interface Substitution { + text: string // Text to replace + alias: string // Replacement text (or pronunciation) +} +``` + +**Example:** +```ts +{ + text: "W3C", + alias: "World Wide Web Consortium" +} +``` + +--- + +### Class: `DialogueBuilder` + +Chainable builder for multi-speaker dialogues. + +```ts +class DialogueBuilder { + constructor() + addTurn(turn: DialogueTurn): DialogueBuilder + build(): Dialogue + reset(): DialogueBuilder +} +``` + +**Example:** +```ts +const dialogue = new DialogueBuilder() + .addTurn({ voice: "zh-CN-XiaoxiaoNeural", text: "你好", style: "friendly" }) + .addTurn({ voice: "en-US-AndrewNeural", text: "Hello", lang: "en-US" }) + .build(); +``` + +--- + +### Function: `buildDialogueSSML(turns: DialogueTurn[]): string` + +Functional API to build SSML from dialogue turns. + +**Parameters:** +| Parameter | Type | Description | +|-----------|------|-------------| +| `turns` | `DialogueTurn[]` | Array of dialogue turns | + +**Returns:** Complete SSML string + +**Example:** +```ts +const ssml = buildDialogueSSML([ + { voice: "zh-CN-XiaoxiaoNeural", text: "你好" }, + { voice: "en-US-AndrewNeural", text: "Hello", lang: "en-US" } +]); +``` + +--- + +### Function: `escapeSSML(text: string): string` + +Escape special XML characters in text. + +**Escapes:** +- `&` → `&amp;` +- `<` → `&lt;` +- `>` → `&gt;` +- `"` → `&quot;` +- `'` → `&apos;` + +**Example:** +```ts +escapeSSML("Tom & Jerry <Cat>") +// Returns: "Tom &amp; Jerry &lt;Cat&gt;" +``` + +--- + +### Function: `validateStyle(style: string): void` + +Validate emotional style name. Throws `Error` if invalid. + +**Valid styles:** All 28 Microsoft official styles (see table above) + +--- + +### Function: `validateStyleDegree(degree: number): void` + +Validate style intensity range. 
Throws `Error` if outside 0.01-2.0. + +--- + +## Error Handling + +### Common Errors + +#### 1. Metadata Not Configured + +```ts +// ❌ Wrong: Calling toStream without setMetadata +const { audioStream } = await tts.toStream("Hello"); +// Throws: "Speech synthesis not configured yet..." + +// ✅ Correct: +await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS); +const { audioStream } = await tts.toStream("Hello"); +``` + +--- + +#### 2. Invalid Voice Name + +```ts +// ❌ Wrong: Invalid voice name +await tts.setMetadata("invalid-voice", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS); +// May throw: "Could not infer voiceLocale from voiceName..." + +// ✅ Correct: Use valid ShortName from getVoices() +const voices = await tts.getVoices(); +const validVoice = voices[0].ShortName; +await tts.setMetadata(validVoice, OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS); +``` + +--- + +#### 3. Invalid Style Name + +```ts +import { buildDialogueSSML } from "msedge-tts"; + +// ❌ Wrong: Invalid style +const ssml = buildDialogueSSML([ + { voice: "zh-CN-XiaoxiaoNeural", text: "Hello", style: "invalid-style" } +]); +// Throws: 'Invalid style "invalid-style". Valid styles: ...' + +// ✅ Correct: Use valid style +const ssml = buildDialogueSSML([ + { voice: "zh-CN-XiaoxiaoNeural", text: "Hello", style: "cheerful" } +]); +``` + +--- + +#### 4. Invalid styleDegree Range + +```ts +// ❌ Wrong: Out of range +const ssml = buildDialogueSSML([ + { voice: "zh-CN-XiaoxiaoNeural", text: "Hello", style: "sad", styleDegree: 5.0 } +]); +// Throws: 'styleDegree must be between 0.01 and 2.0' + +// ✅ Correct: Within range 0.01-2.0 +const ssml = buildDialogueSSML([ + { voice: "zh-CN-XiaoxiaoNeural", text: "Hello", style: "sad", styleDegree: 1.5 } +]); +``` + +--- + +#### 5. 
No Audio Data Received + +```ts +// May occur if: +// - Network connection lost +// - Invalid SSML syntax +// - Voice service unavailable + +try { + await tts.toFile("./output", "Hello"); +} catch (error) { + console.error("Generation failed:", error.message); + // Handle: "No audio data received" +} +``` + +--- + +## Performance Optimization + +### 1. Reuse MsEdgeTTS Instance + +```ts +// ❌ Inefficient: Create new instance for each request +for (const [i, text] of texts.entries()) { + const tts = new MsEdgeTTS(); + await tts.setMetadata(voice, format); + await tts.toFile(`./output/${i}`, text); +} + +// ✅ Efficient: Reuse instance +const tts = new MsEdgeTTS(); +await tts.setMetadata(voice, format); +for (const [i, text] of texts.entries()) { + await tts.toFile(`./output/${i}`, text); +} +``` + +--- + +### 2. Batch Dialogue Turns + +```ts +// ❌ Inefficient: Separate requests +await tts.toFile("./output/1", buildDialogueSSML([turn1])); +await tts.toFile("./output/2", buildDialogueSSML([turn2])); + +// ✅ Efficient: Single request +await tts.toFile("./output/combined", buildDialogueSSML([turn1, turn2, turn3])); +``` + +--- + +### 3. Use Appropriate Bitrate + +| Use Case | Recommended Format | +|----------|-------------------| +| Podcast/Audiobook | `AUDIO_24KHZ_96KBITRATE_MONO_MP3` | +| Voice Assistant | `AUDIO_24KHZ_48KBITRATE_MONO_MP3` | +| Web Streaming | `WEBM_24KHZ_16BIT_MONO_OPUS` | + +--- + +### 4. Enable Logger for Debugging + +```ts +const tts = new MsEdgeTTS({ enableLogger: true }); +// Logs: connection status, message exchange, disconnection +``` + +--- + +## FAQ + +### Q: Can I use this library in the browser? + +**A:** No. As of December 2025, the API requires a Microsoft Edge User-Agent, which browsers other than Edge cannot provide. Use this library in server-side Node.js environments only. + +--- + +### Q: How do I get a list of all available voices? 
+ +**A:** Use the `getVoices()` method: + +```ts +const tts = new MsEdgeTTS(); +const voices = await tts.getVoices(); +console.log(voices.map(v => ({ name: v.ShortName, gender: v.Gender, locale: v.Locale }))); +``` + +--- + +### Q: Can I mix multiple languages in one dialogue? + +**A:** Yes! Use the `lang` property in `DialogueTurn`: + +```ts +const ssml = buildDialogueSSML([ + { voice: "zh-CN-XiaoxiaoNeural", text: "Welcome to our meeting", style: "friendly" }, + { voice: "zh-CN-XiaoxiaoNeural", text: "欢迎参加我们的会议", lang: "zh-CN" }, + { voice: "en-US-AndrewNeural", text: "Today we will discuss AI", lang: "en-US" } +]); +``` + +--- + +### Q: How do I change the speaking speed? + +**A:** Use the `rate` option: + +```ts +// Using preset +await tts.toStream("Hello", { rate: RATE.FAST }); + +// Using custom value (0.5 = 50% speed, 2.0 = 200% speed) +await tts.toStream("Hello", { rate: 0.75 }); + +// Using percentage string +await tts.toStream("Hello", { rate: "+50%" }); // 150% speed +``` + +--- + +### Q: What is the maximum text length for synthesis? + +**A:** Microsoft Azure Speech Service has a limit of approximately 1000 characters per request. For longer texts: +1. Split into multiple requests +2. Use `DialogueBuilder` to chain segments +3. Concatenate audio files post-synthesis + +--- + +### Q: How do I use a proxy? + +**A:** Pass a custom HTTP agent: + +```ts +import { SocksProxyAgent } from 'socks-proxy-agent'; +import { MsEdgeTTS } from "msedge-tts"; + +const agent = new SocksProxyAgent("socks://user:pass@proxy-host:port"); +const tts = new MsEdgeTTS({ agent }); +await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS); +``` + +--- + +### Q: Why am I getting "No audio data received"? + +**A:** Common causes: +1. **Network issues**: Check internet connection +2. **Invalid SSML**: Verify SSML syntax +3. **Voice service down**: Try a different voice +4. 
**Rate limiting**: Wait and retry + +--- + +## Changelog + +### Version 2.0.4 (Current) + +**Features:** +- ✅ Multi-speaker dialogue support (`DialogueBuilder`, `buildDialogueSSML`) +- ✅ 28 emotional styles with intensity control (0.01-2.0) +- ✅ Text substitution (`<sub>` tags) +- ✅ Multi-language mixing (`<lang>` tags) +- ✅ Sentence/word boundary metadata +- ✅ Proxy support via custom HTTP agent +- ✅ 3 audio output formats (MP3 48/96 kbps, WEBM OPUS) + +**Breaking Changes:** +- ⚠️ December 2025: API now requires Edge User-Agent (browser support dropped) + +**Dependencies:** +- `axios`: ^1.11.0 +- `isomorphic-ws`: ^5.0.0 +- `ws`: ^8.14.1 +- `buffer`: ^6.0.3 +- `stream-browserify`: ^3.0.0 + +--- + +## Related Projects + +- [Azure Speech Service Documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/) +- [SSML Specification](https://www.w3.org/TR/speech-synthesis11/) +- [Microsoft Edge Read Aloud API](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech) + +--- + +## License + +MIT License - See LICENSE file for details. + +--- + +## Support + +- **Issues**: [GitHub Issues](https://github.com/Migushthe2nd/MsEdgeTTS/issues) +- **npm**: [msedge-tts](https://www.npmjs.com/package/msedge-tts) +- **Documentation**: [API Docs](https://migushthe2nd.github.io/MsEdgeTTS) diff --git a/docs/ssml-pronunciation.md b/docs/ssml-pronunciation.md new file mode 100644 index 0000000..52802b1 --- /dev/null +++ b/docs/ssml-pronunciation.md @@ -0,0 +1,205 @@ +# Pronunciation in Speech Synthesis Markup Language (SSML) - Speech Service - Foundry Tools | Microsoft Learn + +Speech Synthesis Markup Language (SSML) can be used with text-to-speech to specify how speech should be pronounced. For example, SSML can be used with phonemes and custom dictionaries to improve pronunciation. + +## Phoneme Element + +The `phoneme` element is used for pronunciation in SSML documents. Always provide human-readable speech as a fallback. 
+ +| Attribute | Description | Required or Optional | +| --- | --- | --- | +| `alphabet` | Phonetic alphabet. Supported: `ipa`, `sapi`, `ups`, `x-sampa`. | Optional | +| `ph` | Phoneme string containing the pronunciation of the word. | Required | + +### Phoneme Examples + +Using the IPA alphabet: + +```xml + + + tomato + + +``` + +Using the SAPI alphabet: + +```xml + + + en-US + + +``` + +Using the x-sampa alphabet: + +```xml + + + hello + + +``` + +## Custom Dictionary + +Use the `lexicon` element to reference a custom dictionary XML file to define pronunciations for multiple entities. + +| Attribute | Description | Required or Optional | +| --- | --- | --- | +| `uri` | URI of the custom dictionary XML file (`.xml` or `.pls`). | Required | + +### Custom Dictionary Example + +```xml + + + + BTW, we will be there probably at 8:00 tomorrow morning. + + +``` + +### Custom Dictionary File Format + +```xml + + + + BTW + By the way + + + Benigni + bɛˈniːnji + + + 😀 + test emoji + + +``` + +**Limitations**: +- Maximum file size: 100 KB +- Dictionary cache refreshes every 15 minutes +- One locale per dictionary + +## Say-as Element + +Indicates the content type of the element text (such as numbers, dates, etc.). + +| Attribute | Description | Required or Optional | +| --- | --- | --- | +| `interpret-as` | Content type. Supported: `characters`, `cardinal`, `ordinal`, `date`, `time`, `currency`, `telephone`, etc. | Required | +| `format` | Exact format (such as `mdy`, `hms12`, etc.). | Optional | +| `detail` | Level of detail for reading. | Optional | + +### Say-as Examples + +```xml + + +

+ Your 1st request was for 1 room + on 10/19/2010 , + with early arrival at 12:35pm . +

+
+
+``` + +### Supported interpret-as Values + +| interpret-as | Description | +| --- | --- | +| `characters`, `spell-out` | Spell out letter by letter | +| `alphanumeric` | Alphanumeric mixed spelling | +| `cardinal`, `number` | Cardinal numbers | +| `ordinal` | Ordinal numbers | +| `number_digit` | Sequence of individual digits | +| `fraction` | Fractions | +| `date` | Dates | +| `time` | Time | +| `duration` | Duration | +| `telephone` | Phone numbers | +| `currency` | Currency | +| `unit` | Units of measurement | +| `address` | Addresses | +| `name` | Personal names | + +## Sub Element + +Use the `sub` element to specify alias text to replace the original element text. + +```xml + + + W3C + + +``` + +## Reading Mathematical Expressions + +### Method 1: Plain Text Mathematical Expressions + +```xml + + + + x = (-b ± √(b² - 4ac)) / 2a + + +``` + +Read out parentheses: + +```xml + + + + x = (-b ± √(b² - 4ac)) / 2a + + +``` + +### Method 2: Using MathML + +```xml + + + + + a + 2 + + + + + b + 2 + + = + + c + 2 + + + + +``` + +Output: "a squared plus b squared equals c squared" + +--- + +**Note**: This documentation is based on Microsoft's official SSML documentation. For the most up-to-date information, please refer to the [Microsoft Azure Speech Service documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup-pronunciation). + +© Microsoft Corporation. All rights reserved. diff --git a/docs/ssml-structure.md b/docs/ssml-structure.md new file mode 100644 index 0000000..b12447c --- /dev/null +++ b/docs/ssml-structure.md @@ -0,0 +1,256 @@ +# Speech Synthesis Markup Language (SSML) Document Structure and Events - Speech Service - Foundry Tools | Microsoft Learn + +Speech Synthesis Markup Language (SSML), used together with input text, determines the structure, content, and other characteristics of text-to-speech output. For example, you can use SSML to define paragraphs, sentences, breaks/pauses, or silence. 
You can wrap text with event markers (such as bookmarks or visemes) that can be processed later by applications. + +For more information on how to structure elements in an SSML document, see the following sections. + +> **Note** +> +> In addition to Azure Neural (non-HD) voices in Foundry Tools, you can also use [Azure HD (High-Definition) Voices in Foundry Tools](high-definition-voices) and [Azure OpenAI Neural (HD and non-HD) Voices](openai-voices). HD voices provide higher quality for more diverse scenarios. + +Certain voices do not support all [Speech Synthesis Markup Language (SSML)](speech-synthesis-markup-structure) tags. This includes Neural Text-to-Speech HD voices, Personal Voices, and Embedded Voices. + +- For Azure HD voices, check SSML support [here](speech-synthesis-markup-voice). +- For Personal Voices, SSML support can be found [here](personal-voice-how-to-use#supported-and-unsupported-ssml-elements-for-personal-voice). +- For Embedded Voices, check SSML support [here](embedded-speech#embedded-voices-capabilities). + +## Document Structure + +The Speech Service implementation of SSML is based on the World Wide Web Consortium's [Speech Synthesis Markup Language Version 1.0](https://www.w3.org/TR/2004/REC-speech-synthesis-20040907/). The elements supported by Speech Service may differ from the W3C standard. + +Each SSML document is created using SSML elements (or tags). These elements are used to adjust speech, style, syllables, prosody, volume, and more. + +Below is a subset of the basic structure and syntax of an SSML document: + +```xml + + + + + + + + + + + + + + + + +

+ + + + + +
+
+``` + +The following list describes examples of some content allowed within each element: + +- `audio`: If the audio file doesn't exist or can't be played, you can include speakable plain text or SSML tags in the body of the `audio` element. The `audio` element also contains text and the following elements: `audio`, `break`, `p`, `s`, `phoneme`, `prosody`, `say-as`, and `sub`. +- `bookmark`: This element cannot contain text or any other elements. +- `break`: This element cannot contain text or any other elements. +- `emphasis`: This element can contain text and the following elements: `audio`, `break`, `emphasis`, `lang`, `phoneme`, `prosody`, `say-as`, and `sub`. +- `lang`: This element can contain all other elements except `mstts:backgroundaudio`, `voice`, and `speak`. +- `lexicon`: This element cannot contain text or any other elements. +- `math`: This element can contain only text and MathML elements. +- `mstts:audioduration`: This element cannot contain text or any other elements. +- `mstts:backgroundaudio`: This element cannot contain text or any other elements. +- `mstts:voiceconversion`: This element cannot contain text or any other elements. It specifies the source audio URL for voice conversion. +- `mstts:embedding`: This element can contain text and the following elements: `audio`, `break`, `emphasis`, `lang`, `phoneme`, `prosody`, `say-as`, and `sub`. +- `mstts:express-as`: This element can contain text and the following elements: `audio`, `break`, `emphasis`, `lang`, `phoneme`, `prosody`, `say-as`, and `sub`. +- `mstts:silence`: This element cannot contain text or any other elements. +- `mstts:viseme`: This element cannot contain text or any other elements. +- `p`: This element can contain text and the following elements: `audio`, `break`, `phoneme`, `prosody`, `say-as`, `sub`, `mstts:express-as`, and `s`. +- `phoneme`: This element can contain only text and cannot contain any other elements. 
+- `prosody`: This element can contain text and the following elements: `audio`, `break`, `p`, `phoneme`, `prosody`, `say-as`, `sub`, and `s`. +- `s`: This element can contain text and the following elements: `audio`, `break`, `phoneme`, `prosody`, `say-as`, `mstts:express-as`, and `sub`. +- `say-as`: This element can contain only text and cannot contain any other elements. +- `sub`: This element can contain only text and cannot contain any other elements. +- `speak`: The root element of an SSML document. This element can contain the following elements: `mstts:backgroundaudio` and `voice`. +- `voice`: This element can contain all other elements except `mstts:backgroundaudio` and `speak`. + +The Speech Service can automatically handle pauses appropriately (for example, pausing briefly after a period) or use the correct intonation for sentences ending with a question mark. + +## Special Characters + +To use the characters `&`, `<`, and `>` in the values or text of SSML elements, you must use entity formatting. Specifically, you must use `&amp;` instead of `&`, `&lt;` instead of `<`, and `&gt;` instead of `>`. Otherwise, the SSML will not be parsed correctly. + +For example, specify `green &amp; yellow` instead of `green & yellow`. The following SSML will be parsed correctly: + +```xml + + + My favorite colors are green &amp; yellow. + + +``` + +Special characters such as quotation marks, apostrophes, and parentheses must be escaped. For more information, see [Extensible Markup Language (XML) 1.0: Appendix D](https://www.w3.org/TR/xml/#sec-entexpand). + +Attribute values must be enclosed in double or single quotation marks. For example, `<prosody volume="90">` and `<prosody volume='90'>` are well-formed and valid elements, but `<prosody volume=90>` will not be recognized. + +## Speak Root Element + +The `speak` element contains information such as version, language, and markup vocabulary definitions. The `speak` element is the required root element for all SSML documents. 
You must specify the default language within the `speak` element, regardless of whether you adjust that language elsewhere, such as in the [`lang`](speech-synthesis-markup-voice#use-voice-elements) element. + +Below is the syntax for the `speak` element: + +```xml + +``` + +| Attribute | Description | Required or Optional | +| --- | --- | --- | +| `version` | Indicates the version of the SSML specification used to interpret the document markup. The current version is "1.0". | Required | +| `xml:lang` | The language of the root document. The value can contain a language code (such as `en` for English) or locale information such as `en-US` (English - United States). | Required | +| `xmlns` | The URI for the document that defines the markup vocabulary (element types and attribute names) of the SSML document. The current URI is "http://www.w3.org/2001/10/synthesis". | Required | + +The `speak` element must contain at least one [voice element](speech-synthesis-markup-voice#use-voice-elements). + +### Speak Examples + +The following introduces the values supported by the `speak` element attributes. + +#### Single Voice Example + +This example uses the `en-US-Ava:DragonHDLatestNeural` voice. For more examples, see [Voice Examples](speech-synthesis-markup-voice#voice-examples). + +```xml + + + This is the text that is spoken. + + +``` + +## Adding Breaks + +Use the `break` element to override the default break or pause behavior between words. Otherwise, the Speech Service will automatically insert pauses. + +The following table describes the attribute usage for the `break` element. + +| Attribute | Description | Required or Optional | +| --- | --- | --- | +| `strength` | The relative duration of the pause, using one of the following values:
- x-weak
- weak
- medium (default)
- strong
- x-strong | Optional | +| `time` | The absolute duration of the pause, in seconds (for example `2s`) or milliseconds (for example `500ms`). Valid values range from 0 to 20000 milliseconds. If the set value is greater than the supported maximum, the service will use `20000ms`. If the `time` attribute is set, the `strength` attribute is ignored. | Optional | + +Below are more details about the `strength` attribute. + +| Strength | Relative Duration | +| --- | --- | +| x-weak | 250 ms | +| weak | 500 ms | +| medium | 750 ms | +| strong | 1,000 ms | +| x-strong | 1,250 ms | + +### Break Examples + +The following introduces the values supported by the `break` element attributes. All three methods below add a 750ms break. + +```xml + + + Welcome to text to speech. + Welcome to text to speech. + Welcome to text to speech. + + +``` + +## Adding Silence + +Use the `mstts:silence` element to add pauses before or after text, or between two adjacent sentences. + +One difference between `mstts:silence` and `break` is that the `break` element can be inserted anywhere in the text. Silence applies only to the beginning or end of input text, or at the boundary between two adjacent sentences. + +The silence setting applies to all input text within the `voice` element where it is located. To reset or change the silence setting again, you must use a new `voice` element containing the same or a different voice. + +The following table describes the attribute usage for the `mstts:silence` element. + +| Attribute | Description | Required or Optional | +| --- | --- | --- | +| `type` | Specifies where and how silence is added. The following silence types are supported:
- `Leading` – Additional silence at the beginning of text. The set value is added to the natural silence before the beginning of the text.
- `Leading-exact` – Silence at the beginning of text. The value is the absolute silence length.
- `Tailing` – Additional silence at the end of text. The set value is added to the natural silence after the last word.
- `Tailing-exact` – Silence at the end of text. The value is the absolute silence length.
- `Sentenceboundary` – Additional silence between adjacent sentences. The actual silence length for this type includes the natural silence after the last word of the previous sentence, the value set for this type, and the natural silence before the starting word of the next sentence.
- `Sentenceboundary-exact` – Silence between adjacent sentences. The value is the absolute silence length.
- `Comma-exact` – Silence at half-width or full-width commas. The value is the absolute silence length.
- `Semicolon-exact` – Silence at half-width or full-width semicolons. The value is the absolute silence length.
- `Enumerationcomma-exact` – Silence at full-width enumeration commas. The value is the absolute silence length.

Absolute silence types (with the `-exact` suffix) replace any other natural leading or trailing silence. Absolute silence types take precedence over their corresponding non-absolute types. For example, if both `Leading` and `Leading-exact` types are set, the `Leading-exact` type takes effect. [WordBoundary events](how-to-speech-synthesis#subscribe-to-synthesizer-events) take precedence over punctuation-related silence settings, including `Comma-exact`, `Semicolon-exact`, or `Enumerationcomma-exact`. When using both `WordBoundary` events and punctuation-related silence settings, the punctuation-related silence settings will not take effect. | Required | +| `value` | The pause duration, in seconds (for example `2s`) or milliseconds (for example `500ms`). Valid values range from 0 to 20000 milliseconds. If the set value is greater than the supported maximum, the service will use `20000ms`. | Required | + +### MSTTS Silence Examples + +The following introduces the values supported by the `mstts:silence` element attributes. + +In this example, `mstts:silence` is used to add 200ms of silence between two sentences. + +```xml + + + +If we're home schooling, the best we can do is roll with what each day brings and try to have fun along the way. +A good place to start is by trying out the slew of educational apps that are helping children stay happy and smash their schooling at the same time. + + +``` + +In this example, `mstts:silence` is used to add 50ms of silence at commas, 100ms of silence at semicolons, and 150ms of silence at enumeration commas. + +```xml + + +你好呀,云希、晓晓;你好呀。 + + +``` + +## Specifying Paragraphs and Sentences + +The `p` and `s` elements are used to represent paragraphs and sentences, respectively. If these elements are missing, the Speech Service will automatically determine the structure of the SSML document. + +### Paragraphs and Sentences Example + +The following example defines two paragraphs, where each paragraph contains sentences. 
In the second paragraph, the Speech Service automatically determines the sentence structure because its sentences are not explicitly marked in the SSML document. + +```xml + + +

+ Introducing the sentence element. + Used to mark individual sentences. +

+

+ Another simple paragraph. + Sentence structure in this paragraph is not explicitly marked. +

+
+
+``` + +## Bookmark Element + +You can use the `bookmark` element in SSML to reference specific positions in text or a sequence of tags. Then use the Speech SDK and subscribe to the `BookmarkReached` event to get the offset of each bookmark in the audio stream. The `bookmark` element is not spoken aloud. For more information, see [Subscribe to Synthesizer Events](how-to-speech-synthesis#subscribe-to-synthesizer-events). + +The following table describes the attribute usage for the `bookmark` element. + +| Attribute | Description | Required or Optional | +| --- | --- | --- | +| `mark` | The reference text for the `bookmark` element. | Required | + +### Bookmark Example + +The following introduces the values supported by the `bookmark` element attributes. + +You might want to know the time offset of each flower-related word in the following code snippet: + +```xml + + + We are selling roses and daisies. + + +``` + +--- + +*This documentation is adapted from Microsoft Azure Speech Service official documentation. All SSML specifications and element descriptions are based on Microsoft's technical documentation.* diff --git a/docs/ssml-voice.md b/docs/ssml-voice.md new file mode 100644 index 0000000..2f97a03 --- /dev/null +++ b/docs/ssml-voice.md @@ -0,0 +1,238 @@ +# Voice and Sounds in Speech Synthesis Markup Language (SSML) - Speech Service - Foundry Tools | Microsoft Learn + +You can use Speech Synthesis Markup Language (SSML) to specify the voice, language, name, style, and role for text-to-speech output. You can also use multiple voices in a single SSML document and adjust stress, speech rate, pitch, and volume. Additionally, SSML allows insertion of pre-recorded audio, such as sound effects or musical notes. + +This article describes how to use SSML elements to specify voice and sounds. For more information about SSML syntax, see [SSML document structure and events](speech-synthesis-markup-structure). 
+ +## Using the voice element + +You must specify at least one `name` attribute in each SSML `voice` element. This attribute determines the voice used for text-to-speech. + +You can include multiple `voice` elements in a single SSML document. Each `voice` element can specify a different voice. You can also use the same voice multiple times with different settings, for example, when [changing the duration of silence between sentences](speech-synthesis-markup-structure#add-silence). + +The following table describes the usage of `voice` element attributes: + +| Attribute | Description | Required or Optional | +| --- | --- | --- | +| `name` | The voice used for text-to-speech output. For a complete list of supported standard voices, see [Language support](language-support?tabs=tts). | Required | +| `effect` | Audio effect processor used to optimize the quality of synthesized speech output on devices for specific scenarios. In certain production scenarios, the listening experience may be degraded due to playback distortion on certain devices. For example, synthesized speech from car speakers may sound dull and muffled due to environmental factors such as speaker response, room reverberation, and background noise. Passengers may have to turn up the volume to hear more clearly. To avoid manual operation in this situation, the audio effect processor can make the voice clearer by compensating for playback distortion. The following values are supported:
- `eq_car` - Optimizes the listening experience when delivering high-fidelity speech in cars, buses, and other enclosed vehicles.
- `eq_telecomhp8k` - Optimizes the listening experience for narrowband speech in telecommunications or telephony scenarios. A sample rate of 8 kHz should be used. If the sample rate is not 8 kHz, the listening quality of the output speech will not be optimized.

If the value is missing or invalid, this attribute is ignored and no effect is applied. | Optional | + +### Voice examples + +#### Single voice example + +```xml + + + This is the text that is spoken. + + +``` + +#### Multiple voices example + +```xml + + + Good morning! + + + Good morning to you too Ava! + + +``` + +#### Audio effect example + +```xml + + + This is the text that is spoken. + + +``` + +#### Multi-speaker voice example + +```xml + + + + Hello, Andrew! How's your day going? + Hey Ava! It's been great, just exploring some AI advancements in communication. + + + +``` + +## Using speaking styles and roles + +By default, neural voices use a neutral speaking style. You can adjust the speaking style, style intensity, and role at the sentence level. + +The following table describes the usage of `mstts:express-as` element attributes: + +| Attribute | Description | Required or Optional | +| --- | --- | --- | +| `style` | The speaking style for a specific voice. Can express emotions such as happiness, sympathy, and calmness. | Required | +| `styledegree` | The intensity of the speaking style. Acceptable values range from `0.01` to `2` (inclusive). Default value is `1`. | Optional | +| `role` | Role-playing when speaking. Voices can imitate different ages and genders. | Optional | + +### Supported styles + +| Style | Description | +| --- | --- | +| `advertisement_upbeat` | Promote products or services with an excited and energetic tone. | +| `affectionate` | Express warm and affectionate tone with higher pitch and volume. | +| `angry` | Express angry and disgusted tone. | +| `assistant` | Speak in a warm and relaxed tone, used for digital assistants. | +| `calm` | Speak with composure and calmness. | +| `chat` | Express a relaxed and casual tone. | +| `cheerful` | Express a positive and pleasant tone. | +| `customerservice` | Provide support to customers with a friendly and enthusiastic tone. 
| +| `depressed` | Express melancholy and depressed tone with lower pitch and volume. | +| `documentary-narration` | Narrate documentaries in a relaxed, interested, and informative style. | +| `empathetic` | Express care and understanding. | +| `excited` | Express an optimistic and hopeful tone. | +| `fearful` | Express fear with higher pitch, higher volume, and faster speech rate. | +| `friendly` | Express a pleasant, charming, and warm tone. | +| `gentle` | Express a mild, polite, and pleasant tone with lower pitch and volume. | +| `hopeful` | Speak in a warm and longing tone. | +| `lyrical` | Express emotions in a graceful and slightly sentimental way. | +| `narration-professional` | Read content in a professional and objective tone. | +| `narration-relaxed` | Speak in a soothing and pleasant tone, used for content narration. | +| `newscast` | Narrate news in a formal and professional tone. | +| `newscast-casual` | Deliver general news in a common, casual tone. | +| `newscast-formal` | Deliver news in a formal, confident, and authoritative tone. | +| `poetry-reading` | Express emotional and rhythmic tone when reading poetry. | +| `sad` | Express a sorrowful tone. | +| `serious` | Express a serious and commanding tone. | +| `shouting` | Sound as if speaking from a distance or in another location. | +| `sports_commentary` | Express a relaxed yet interested tone for broadcasting sports events. | +| `sports_commentary_excited` | Broadcast sports event highlights with a fast and energetic tone. | +| `terrified` | Express a fearful tone with fast speech rate and trembling voice. | +| `unfriendly` | Express a cold and indifferent tone. | +| `whispering` | Speak in a soft tone trying to produce a gentle and mild sound. | + +### Supported roles + +| Role | Description | +| --- | --- | +| `Girl` | Voice imitates a girl. | +| `Boy` | Voice imitates a boy. | +| `YoungAdultFemale` | Voice imitates a young adult female. 
| +| `YoungAdultMale` | Voice imitates a young adult male. | +| `OlderAdultFemale` | Voice imitates an older adult female. | +| `OlderAdultMale` | Voice imitates an older adult male. | +| `SeniorFemale` | Voice imitates an elderly female. | +| `SeniorMale` | Voice imitates an elderly male. | + +### Style and style degree examples + +```xml + + + + Hurry up, be careful on the road, and come back early. + + + +``` + +### Role examples + +```xml + + + The daughter saw her father walk in and asked: + + "You came pretty fast, how did you get here?" + + The father put down his bag and said: + + "I just took a taxi, the traffic was smooth." + + + +``` + +## Adjusting speaking language + +Use the `` element to adjust the speaking language for multilingual voices. + +```xml + + + + Wir freuen uns auf die Zusammenarbeit mit Ihnen! + + + +``` + +## Adjusting prosody + +Use the `prosody` element to specify variations in pitch, intonation, range, speech rate, and volume. + +| Attribute | Description | +| --- | --- | +| `contour` | Contour curve representing pitch variations. | +| `pitch` | Baseline pitch. Available values: `x-low`, `low`, `medium`, `high`, `x-high`, or relative values (e.g., `+20Hz`, `-2st`). | +| `range` | Pitch range. | +| `rate` | Speech rate. Available values: `x-slow`, `slow`, `medium`, `fast`, `x-fast`, or relative values (e.g., `+30%`). | +| `volume` | Volume level. Available values: `silent`, `x-soft`, `soft`, `medium`, `loud`, `x-loud`, or relative values (e.g., `+20`). | + +### Prosody example + +```xml + + + + Enjoy using text to speech. + + + +``` + +## Adding recorded audio + +```xml + + + + +``` + +## Adding background audio + +```xml + + + + The text provided in this document are spoken over the background audio. 
+ + +``` + +## Voice conversion element + +```xml + + + + + +``` + +--- + +## Related Links + +- [Microsoft Azure Speech Service Documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/) +- [SSML Specification](https://www.w3.org/TR/speech-synthesis11/) +- [Language Support for Text-to-Speech](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support?tabs=tts) + +--- + +*This documentation is translated from Microsoft official documentation. All rights reserved to Microsoft.* diff --git a/example/.gitignore b/example/.gitignore new file mode 100644 index 0000000..ef0eb55 --- /dev/null +++ b/example/.gitignore @@ -0,0 +1,4 @@ +config.json +output/ +*.mp3 +.DS_Store diff --git a/example/00-simple-dialogue-demo.ts b/example/00-simple-dialogue-demo.ts new file mode 100644 index 0000000..91f727e --- /dev/null +++ b/example/00-simple-dialogue-demo.ts @@ -0,0 +1,95 @@ +import * as fs from "fs"; +import * as path from "path"; + +/** + * Example 0: Simple Dialogue Demo + * Directly use the given SSML example (daughter-father conversation) + */ +async function main() { + // Output decorative box + console.log("╔═══════════════════════════════════════════════╗"); + console.log("║ Example 0: Simple Dialogue Demo ║"); + console.log("╚═══════════════════════════════════════════════╝"); + console.log(); + + // Read configuration + const configPath = path.join(__dirname, "config.json"); + if (!fs.existsSync(configPath)) { + console.error("❌ Error: config.json does not exist"); + console.error("📝 Please copy config.example.json to config.json and fill in your email and password"); + console.error(`📁 Example file location: ${configPath}`); + process.exit(1); + } + + const config = JSON.parse(fs.readFileSync(configPath, "utf-8")); + + // Given SSML example: daughter-father conversation + const ssml = ` + + 女儿看见父亲走了进来,问道: + + "您来的挺快的,怎么过来的?" 
+ + 父亲放下手提包,说: + + "刚打车过来的,路上还挺顺畅。" + + +`; + + // Display the complete SSML + console.log("SSML Used:"); + console.log("┌──────────────────────────────────────────────┐"); + const ssmlLines = ssml.split("\n"); + for (const line of ssmlLines) { + const truncated = line.length > 44 ? line.substring(0, 41) + "..." : line; + console.log(`│ ${truncated.padEnd(44)} │`); + } + console.log("└──────────────────────────────────────────────┘"); + console.log(); + + // Output path + const outputDir = path.join(__dirname, "output"); + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + const outputPath = path.join(outputDir, "00-simple-dialogue-demo.mp3"); + + // Call TTS API + console.log("Calling TTS API..."); + + try { + const response = await fetch(config.api_url, { + method: "POST", + headers: { "Content-Type": "application/x-www-form-urlencoded" }, + body: new URLSearchParams({ + user_email: config.user_email, + user_pass: config.user_pass, + ssml: ssml, + kbitrate: config.kbitrate || "audio-16khz-32kbitrate-mono-mp3", + }), + }); + + if (!response.ok) { + throw new Error(`API request failed: ${response.status} ${response.statusText}`); + } + + // Save file + const buffer = Buffer.from(await response.arrayBuffer()); + fs.writeFileSync(outputPath, buffer); + + // Calculate file size + const fileSizeKB = (buffer.length / 1024).toFixed(1); + + console.log("✅ Audio generation successful!"); + console.log(`📁 File saved: ${outputPath}`); + console.log(`📊 File size: ${fileSizeKB} KB`); + } catch (error) { + console.error("❌ Generation failed:", error instanceof Error ? 
error.message : error); + process.exit(1); + } +} + +main(); diff --git a/example/01-multi-speaker-dialogue-chained.ts b/example/01-multi-speaker-dialogue-chained.ts new file mode 100644 index 0000000..da7a1e5 --- /dev/null +++ b/example/01-multi-speaker-dialogue-chained.ts @@ -0,0 +1,112 @@ +import { DialogueBuilder, buildDialogueSSML, type DialogueTurn } from "../src"; +import * as fs from "fs"; +import * as path from "path"; + +/** + * Example 1: Multi-Speaker Dialogue - Chained Call + * Build Chinese-English mixed podcast dialogue using DialogueBuilder + */ +async function main() { + // Output decorative box + console.log("╔═══════════════════════════════════════════════╗"); + console.log("║ Example 1: Multi-Speaker Dialogue - Chained ║"); + console.log("╚═══════════════════════════════════════════════╝"); + console.log(); + + // Read configuration + const configPath = path.join(__dirname, "config.json"); + if (!fs.existsSync(configPath)) { + console.error("❌ Error: config.json does not exist"); + console.error("📝 Please copy config.example.json to config.json and fill in email and password"); + console.error(`📁 Example file location: ${configPath}`); + process.exit(1); + } + + const config = JSON.parse(fs.readFileSync(configPath, "utf-8")); + + // Build dialogue: 4 speaker turns (2 Chinese + 2 English) + const dialogue = new DialogueBuilder() + .addTurn({ + voice: "zh-CN-XiaoxiaoNeural", + text: "大家好!欢迎收听今天的科技播客。", + style: "cheerful", + }) + .addTurn({ + voice: "en-US-AndrewNeural", + text: "Hello everyone! Welcome to today's tech podcast.", + lang: "en-US", + style: "friendly", + }) + .addTurn({ + voice: "zh-CN-YunxiNeural", + text: "今天我们将探讨人工智能的最新发展。", + style: "documentary-narration", + }) + .addTurn({ + voice: "en-US-AriaNeural", + text: "That's right! 
AI is changing the world faster than ever.", + lang: "en-US", + style: "excited", + }) + .build(); + + console.log(`Generated dialogue turns: ${dialogue.turns.length}`); + console.log(); + + // Generate SSML + const ssml = buildDialogueSSML(dialogue.turns); + + // SSML preview + console.log("SSML Preview:"); + console.log("┌──────────────────────────────────────────────┐"); + const ssmlLines = ssml.split("\n"); + for (const line of ssmlLines) { + const truncated = line.length > 44 ? line.substring(0, 41) + "..." : line; + console.log(`│ ${truncated.padEnd(44)} │`); + } + console.log("└──────────────────────────────────────────────┘"); + console.log(); + + // Output path + const outputDir = path.join(__dirname, "output"); + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + const outputPath = path.join(outputDir, "podcast-dialogue-chained.mp3"); + + // Call TTS API + console.log("Calling TTS API..."); + + try { + const response = await fetch(config.api_url, { + method: "POST", + headers: { "Content-Type": "application/x-www-form-urlencoded" }, + body: new URLSearchParams({ + user_email: config.user_email, + user_pass: config.user_pass, + ssml: ssml, + kbitrate: config.kbitrate || "audio-16khz-32kbitrate-mono-mp3", + }), + }); + + if (!response.ok) { + throw new Error(`API request failed: ${response.status} ${response.statusText}`); + } + + // Save file + const buffer = Buffer.from(await response.arrayBuffer()); + fs.writeFileSync(outputPath, buffer); + + // Calculate file size + const fileSizeKB = (buffer.length / 1024).toFixed(1); + + console.log("✅ Audio generation successful!"); + console.log(`📁 File saved: ${outputPath}`); + console.log(`📊 File size: ${fileSizeKB} KB`); + } catch (error) { + console.error("❌ Generation failed:", error instanceof Error ? 
error.message : error); + process.exit(1); + } +} + +main(); diff --git a/example/02-multi-speaker-dialogue-functional.ts b/example/02-multi-speaker-dialogue-functional.ts new file mode 100644 index 0000000..af302e5 --- /dev/null +++ b/example/02-multi-speaker-dialogue-functional.ts @@ -0,0 +1,112 @@ +import { buildDialogueSSML, type DialogueTurn } from "../src"; +import * as fs from "fs"; +import * as path from "path"; + +/** + * Example 2: Multi-Speaker Dialogue - Functional + * Build Chinese-English mixed customer service dialogue using buildDialogueSSML function + */ +async function main() { + // Output decorative box + console.log("╔═══════════════════════════════════════════════╗"); + console.log("║ Example 2: Multi-Speaker Dialogue - Functional ║"); + console.log("╚═══════════════════════════════════════════════╝"); + console.log(); + + // Read configuration + const configPath = path.join(__dirname, "config.json"); + if (!fs.existsSync(configPath)) { + console.error("❌ Error: config.json does not exist"); + console.error("📝 Please copy config.example.json to config.json and fill in your email and password"); + console.error(`📁 Example file location: ${configPath}`); + process.exit(1); + } + + const config = JSON.parse(fs.readFileSync(configPath, "utf-8")); + + // Build dialogue: 4 speaker turns (2 Chinese customer service + 2 English customer service) + const turns: DialogueTurn[] = [ + { + voice: "zh-CN-XiaoxiaoNeural", + text: "您好!欢迎联系客户服务中心。", + style: "customerservice", + }, + { + voice: "en-US-JennyNeural", + text: "Hello! 
Welcome to customer service.", + lang: "en-US", + style: "friendly", + }, + { + voice: "zh-CN-YunjianNeural", + text: "请问有什么可以帮助您的?", + style: "assistant", + }, + { + voice: "en-US-GuyNeural", + text: "How can I help you today?", + lang: "en-US", + style: "assistant", + }, + ]; + + console.log(`Building dialogue turns: ${turns.length}`); + console.log(); + + // Generate SSML + const ssml = buildDialogueSSML(turns); + + // SSML preview + console.log("SSML Preview:"); + console.log("┌──────────────────────────────────────────────┐"); + const ssmlLines = ssml.split("\n"); + for (const line of ssmlLines) { + const truncated = line.length > 44 ? line.substring(0, 41) + "..." : line; + console.log(`│ ${truncated.padEnd(44)} │`); + } + console.log("└──────────────────────────────────────────────┘"); + console.log(); + + // Output path + const outputDir = path.join(__dirname, "output"); + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + const outputPath = path.join(outputDir, "02-customer-service-dialogue-functional.mp3"); + + // Call TTS API + console.log("Calling TTS API..."); + + try { + const response = await fetch(config.api_url, { + method: "POST", + headers: { "Content-Type": "application/x-www-form-urlencoded" }, + body: new URLSearchParams({ + user_email: config.user_email, + user_pass: config.user_pass, + ssml: ssml, + kbitrate: config.kbitrate || "audio-16khz-32kbitrate-mono-mp3", + }), + }); + + if (!response.ok) { + throw new Error(`API request failed: ${response.status} ${response.statusText}`); + } + + // Save file + const buffer = Buffer.from(await response.arrayBuffer()); + fs.writeFileSync(outputPath, buffer); + + // Calculate file size + const fileSizeKB = (buffer.length / 1024).toFixed(1); + + console.log("✅ Audio generation successful!"); + console.log(`📁 File saved: ${outputPath}`); + console.log(`📊 File size: ${fileSizeKB} KB`); + } catch (error) { + console.error("❌ Generation failed:", error instanceof Error ? 
error.message : error); + process.exit(1); + } +} + +main(); diff --git a/example/03-31-emotional-styles-demo.ts b/example/03-31-emotional-styles-demo.ts new file mode 100644 index 0000000..22863b8 --- /dev/null +++ b/example/03-31-emotional-styles-demo.ts @@ -0,0 +1,104 @@ +/** + * Example 3: 31 Emotional Styles Demo + * + * Demonstrates all 31 emotional styles supported by Microsoft Azure Speech Service. + * Each style is showcased with a sample sentence. + */ + +import { MsEdgeTTS, OUTPUT_FORMAT, buildDialogueSSML, type DialogueTurn } from "../src"; +import * as fs from "fs"; +import * as path from "path"; + +const allStyles = [ + "advertisement_upbeat", "affectionate", "angry", "assistant", + "calm", "chat", "cheerful", "customerservice", + "depressed", "documentary-narration", "empathetic", "excited", + "fearful", "friendly", "gentle", "hopeful", + "lyrical", "narration-professional", "narration-relaxed", "newscast", + "newscast-casual", "newscast-formal", "poetry-reading", "sad", + "serious", "shouting", "sports_commentary", "sports_commentary_excited", + "terrified", "unfriendly", "whispering" +]; + +function printStyleTable(styles: string[]): void { + console.log("\nComplete Emotional Styles List:"); + console.log("┌────┬─────────────────────────────────────┐"); + console.log("│ No. 
/**
 * Example 3 entry point: prints the style table, builds one dialogue turn per
 * emotional style, and synthesises the whole dialogue into a single MP3 under
 * `example/output/`.
 *
 * This example talks to the Edge Read Aloud service through {@link MsEdgeTTS},
 * which is constructed without credentials, so no `config.json` is required.
 * (The previous version read `config.email` / `config.password` — keys that do
 * not exist in `config.example.json` and were never used — and aborted when the
 * file was missing.)
 *
 * Exits the process with status 1 when synthesis fails.
 */
async function main(): Promise<void> {
  console.log("╔═══════════════════════════════════════════════╗");
  console.log("║ Example 3: 31 Emotional Styles Demo ║");
  console.log("╚═══════════════════════════════════════════════╝");

  printStyleTable(allStyles);

  const tts = new MsEdgeTTS();
  const voiceName = "zh-CN-XiaoxiaoNeural";
  const outputFormat = OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3;

  console.log(`\nUsing voice: ${voiceName}`);
  console.log(`Output format: MP3`);

  // One turn per style, all spoken by the same voice.
  const turns: DialogueTurn[] = allStyles.map((style, index) => ({
    voice: voiceName,
    text: `This is style number ${index + 1}: ${style}.`,
    style,
  }));

  const ssml = buildDialogueSSML(turns);
  console.log(`\nGenerated SSML length: ${ssml.length} characters`);

  try {
    await tts.setMetadata(voiceName, outputFormat);

    // `recursive: true` makes this a no-op when the directory already exists.
    const outputDir = path.join(__dirname, "output");
    fs.mkdirSync(outputDir, { recursive: true });

    const outputPath = path.join(outputDir, "03-31-emotional-styles-demo.mp3");

    console.log(`\nGenerating audio...`);
    // `ssml` is already a complete <speak> document, so it must go through the
    // raw entry point. `toFile` would wrap it in the single-voice SSML template
    // a second time and produce an invalid request.
    const { audioFilePath } = await tts.rawToFile(outputDir, ssml);

    // The library picks its own file name; move it to the documented one.
    fs.renameSync(audioFilePath, outputPath);

    console.log(`\n✅ Audio saved to: ${outputPath}`);
    console.log(`✅ Generated ${allStyles.length} emotional style demonstrations`);
  } catch (error: unknown) {
    console.error("\n❌ Error generating audio:");
    console.error(error instanceof Error ? error.message : error);
    process.exit(1);
  }
}
console.log("└──────────────────────────────────────────────┘"); + console.log(); + + // Build dialogue: same sentence, three different intensities + const turns: DialogueTurn[] = [ + { + voice: "zh-CN-XiaomoNeural", + text: "This is normal", + style: "sad", + styleDegree: 0.5, // Weaker + }, + { + voice: "zh-CN-XiaomoNeural", + text: "This is really sad", + style: "sad", + styleDegree: 1.0, // Normal + }, + { + voice: "zh-CN-XiaomoNeural", + text: "This is absolutely heartbreaking!", + style: "sad", + styleDegree: 2.0, // Strongest + }, + ]; + + // Display dialogue content + console.log("📝 Dialogue Content:"); + console.log("┌──────────────────────────────────────────────┐"); + turns.forEach((turn, index) => { + const intensity = turn.styleDegree === 0.5 ? "Weaker" : turn.styleDegree === 1.0 ? "Normal" : "Strongest"; + console.log(`│ ${index + 1}. [Intensity: ${intensity}] ${turn.text.padEnd(25)} │`); + }); + console.log("└──────────────────────────────────────────────┘"); + console.log(); + + // Generate SSML + const ssml = buildDialogueSSML(turns); + + // SSML preview + console.log("📄 SSML Preview:"); + console.log("┌──────────────────────────────────────────────┐"); + const ssmlLines = ssml.split("\n"); + for (const line of ssmlLines) { + const truncated = line.length > 44 ? line.substring(0, 41) + "..." 
: line; + console.log(`│ ${truncated.padEnd(44)} │`); + } + console.log("└──────────────────────────────────────────────┘"); + console.log(); + + // Output path + const outputDir = path.join(__dirname, "output"); + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + const outputPath = path.join(outputDir, "04-style-degree-control-demo.mp3"); + + // Call TTS API + console.log("🎙️ Calling TTS API..."); + + try { + const response = await fetch(config.api_url, { + method: "POST", + headers: { "Content-Type": "application/x-www-form-urlencoded" }, + body: new URLSearchParams({ + user_email: config.user_email, + user_pass: config.user_pass, + ssml: ssml, + kbitrate: config.kbitrate || "audio-16khz-32kbitrate-mono-mp3", + }), + }); + + if (!response.ok) { + throw new Error(`API request failed: ${response.status} ${response.statusText}`); + } + + // Save file + const buffer = Buffer.from(await response.arrayBuffer()); + fs.writeFileSync(outputPath, buffer); + + // Calculate file size + const fileSizeKB = (buffer.length / 1024).toFixed(1); + + console.log("✅ Audio generation successful!"); + console.log(`📁 File saved: ${outputPath}`); + console.log(`📊 File size: ${fileSizeKB} KB`); + console.log(); + console.log("💡 Tip: Play the audio to compare the differences between the three emotional intensities"); + } catch (error) { + console.error("❌ Generation failed:", error instanceof Error ? error.message : error); + process.exit(1); + } +} + +main(); diff --git a/example/05-text-substitution-demo.ts b/example/05-text-substitution-demo.ts new file mode 100644 index 0000000..254f1dd --- /dev/null +++ b/example/05-text-substitution-demo.ts @@ -0,0 +1,153 @@ +import { buildDialogueSSML, type DialogueTurn } from "../src"; +import * as fs from "fs"; +import * as path from "path"; + +/** + * Example 5: Text Substitution Demo + * Demonstrates the substitutions parameter with technical term replacements (W3C, HTTP, CEO, etc.) 
+ */ +async function main() { + // Output decorative box + console.log("╔═══════════════════════════════════════════════╗"); + console.log("║ Example 5: Text Substitution Demo ║"); + console.log("╚═══════════════════════════════════════════════╝"); + console.log(); + + // Read configuration + const configPath = path.join(__dirname, "config.json"); + if (!fs.existsSync(configPath)) { + console.error("❌ Error: config.json does not exist"); + console.error("📝 Please copy config.example.json to config.json and fill in email and password"); + console.error(`📁 Example file location: ${configPath}`); + process.exit(1); + } + + const config = JSON.parse(fs.readFileSync(configPath, "utf-8")); + + // Output substitutions explanation + console.log("📖 substitutions Parameter Explanation:"); + console.log("┌──────────────────────────────────────────────┐"); + console.log("│ Format: { text: string, alias: string } │"); + console.log("│ text: The word in the original text │"); + console.log("│ alias: The alias used during reading │"); + console.log("│ SSML generates text tag│"); + console.log("└──────────────────────────────────────────────┘"); + console.log(); + + // Build dialogue: demonstrate technical term substitution + const turns: DialogueTurn[] = [ + { + voice: "zh-CN-XiaoxiaoNeural", + text: "W3C develops Web standards, API is based on HTTP protocol", + substitutions: [ + { text: "W3C", alias: "World Wide Web Consortium" }, + { text: "Web", alias: "World Wide Web" }, + { text: "HTTP", alias: "Hypertext Transfer Protocol" }, + ], + style: "narration-professional", + }, + { + voice: "en-US-AndrewNeural", + text: "The CEO said: innovation drives success", + substitutions: [ + { text: "CEO", alias: "Chief Executive Officer" }, + ], + style: "newscast-formal", + lang: "en-US", + }, + ]; + + // Display before/after substitution comparison + console.log("📝 Before/After Substitution Comparison:"); + console.log("┌──────────────────────────────────────────────┐"); + 
console.log("│ [Chinese Part] │"); + console.log("│ Original: W3C develops Web standards, API is │"); + console.log("│ based on HTTP protocol │"); + console.log("│ Reading: World Wide Web Consortium develops │"); + console.log("│ World Wide Web standards, API is │"); + console.log("│ based on Hypertext Transfer Protocol│"); + console.log("├──────────────────────────────────────────────┤"); + console.log("│ [English Part] │"); + console.log("│ Original: The CEO said: innovation drives │"); + console.log("│ success │"); + console.log("│ Reading: The Chief Executive Officer said: │"); + console.log("│ innovation drives success │"); + console.log("└──────────────────────────────────────────────┘"); + console.log(); + + // Display substitution rules list + console.log("📋 Substitution Rules List:"); + console.log("┌──────────────────────────────────────────────┐"); + console.log("│ Chinese Part Substitution Rules: │"); + turns[0].substitutions?.forEach((sub) => { + const line = `│ "${sub.text}" → "${sub.alias}"`.padEnd(47) + "│"; + console.log(line); + }); + console.log("├──────────────────────────────────────────────┤"); + console.log("│ English Part Substitution Rules: │"); + turns[1].substitutions?.forEach((sub) => { + const line = `│ "${sub.text}" → "${sub.alias}"`.padEnd(47) + "│"; + console.log(line); + }); + console.log("└──────────────────────────────────────────────┘"); + console.log(); + + // Generate SSML + const ssml = buildDialogueSSML(turns); + + // SSML Preview + console.log("📄 SSML Preview:"); + console.log("┌──────────────────────────────────────────────┐"); + const ssmlLines = ssml.split("\n"); + for (const line of ssmlLines) { + const truncated = line.length > 44 ? line.substring(0, 41) + "..." 
: line; + console.log(`│ ${truncated.padEnd(44)} │`); + } + console.log("└──────────────────────────────────────────────┘"); + console.log(); + + // Output path + const outputDir = path.join(__dirname, "output"); + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + const outputPath = path.join(outputDir, "05-text-substitution-demo.mp3"); + + // Call TTS API + console.log("🎙️ Calling TTS API..."); + + try { + const response = await fetch(config.api_url, { + method: "POST", + headers: { "Content-Type": "application/x-www-form-urlencoded" }, + body: new URLSearchParams({ + user_email: config.user_email, + user_pass: config.user_pass, + ssml: ssml, + kbitrate: config.kbitrate || "audio-16khz-32kbitrate-mono-mp3", + }), + }); + + if (!response.ok) { + throw new Error(`API request failed: ${response.status} ${response.statusText}`); + } + + // Save file + const buffer = Buffer.from(await response.arrayBuffer()); + fs.writeFileSync(outputPath, buffer); + + // Calculate file size + const fileSizeKB = (buffer.length / 1024).toFixed(1); + + console.log("✅ Audio generation successful!"); + console.log(`📁 File saved: ${outputPath}`); + console.log(`📊 File size: ${fileSizeKB} KB`); + console.log(); + console.log("💡 Tip: Play the audio to compare the reading effect before and after substitution"); + } catch (error) { + console.error("❌ Generation failed:", error instanceof Error ? error.message : error); + process.exit(1); + } +} + +main(); diff --git a/example/README.md b/example/README.md new file mode 100644 index 0000000..c5cb3ce --- /dev/null +++ b/example/README.md @@ -0,0 +1,145 @@ +# TTS Pro API Example Code + +## Quick Start + +### 1. 
Configure Account Information + +Copy the configuration template and fill in your email and password: + +```bash +cp config.example.json config.json +``` + +Edit `config.json`: +```json +{ + "user_email": "your-email@example.com", + "user_pass": "your-password", + "api_url": "https://ttspro.cn/getSpeek.php", + "kbitrate": "audio-16khz-32kbitrate-mono-mp3", + "output_format": "binary" +} +``` + +### 2. Build Project + +```bash +pnpm run build +``` + +### 3. Run Examples + +```bash +# Example 1: Multi-Speaker Dialogue (Chained) +node example/01-multi-speaker-dialogue-chained.ts + +# Example 2: Multi-Speaker Dialogue (Functional) +node example/02-multi-speaker-dialogue-functional.ts + +# Example 3: 31 Emotional Styles Demo +node example/03-31-emotional-styles-demo.ts + +# Example 4: Style Degree Control Demo +node example/04-style-degree-control-demo.ts + +# Example 5: Text Substitution Demo +node example/05-text-substitution-demo.ts +``` + +## Example Descriptions + +### Example 1: Multi-Speaker Dialogue (Chained) + +Build dialogue using the `DialogueBuilder` class with chained calls. + +**Features**: +- Chained call syntax +- Chinese-English mixed podcast scenario +- 4 speaker turns + +**Output**: `example/output/01-multi-speaker-dialogue-chained.mp3` + +### Example 2: Multi-Speaker Dialogue (Functional) + +Build dialogue using the `buildDialogueSSML()` function. + +**Features**: +- Functional syntax +- Multi-language customer service dialogue +- 4 dialogue turns + +**Output**: `example/output/02-multi-speaker-dialogue-functional.mp3` + +### Example 3: 31 Emotional Styles Demo + +Demonstrate all 31 emotional styles supported by Microsoft Azure. + +**Features**: +- Complete list of 31 styles +- One example sentence per style +- Table format presentation + +**Output**: `example/output/03-31-emotional-styles-demo.mp3` + +### Example 4: Style Degree Control Demo + +Demonstrate the `styleDegree` parameter (range: 0.01-2.0). 
+ +**Features**: +- Three intensity levels: 0.5/1.0/2.0 +- Uses `sad` emotional style +- Same voice with different intensities + +**Output**: `example/output/04-style-degree-control-demo.mp3` + +### Example 5: Text Substitution Demo + +Demonstrate the `substitutions` parameter for replacing technical terms. + +**Features**: +- W3C → World Wide Web Consortium +- HTTP → HyperText Transfer Protocol +- CEO → Chief Executive Officer + +**Output**: `example/output/05-text-substitution-demo.mp3` + +## API Parameters + +| Parameter | Required | Description | Default | +|-----------|----------|-------------|---------| +| `user_email` | ✅ | User email | - | +| `user_pass` | ✅ | User password | - | +| `type` | ❌ | `getSpeek`/`getBig`/`setBig` | `getSpeek` | +| `ssml` | ✅ | SSML content | - | +| `kbitrate` | ❌ | Audio quality | `audio-16khz-32kbitrate-mono-mp3` | +| `output_format` | ❌ | Return type: `binary`/`url` | `binary` | + +## Output Directory + +All generated audio files are saved in: +``` +example/output/ +├── 01-multi-speaker-dialogue-chained.mp3 +├── 02-multi-speaker-dialogue-functional.mp3 +├── 03-31-emotional-styles-demo.mp3 +├── 04-style-degree-control-demo.mp3 +└── 05-text-substitution-demo.mp3 +``` + +## Notes + +1. **Account Security**: `config.json` is ignored by `.gitignore` and will not be committed to Git +2. **Network Connection**: Running examples requires network connection to call the API +3. **Build Requirement**: You must run `pnpm run build` before running examples +4. **Node Version**: Requires Node.js 18+ (supports `fetch` API) + +## FAQ + +### Q: It says "config.json does not exist" +A: Please copy `config.example.json` to `config.json` and fill in your email and password + +### Q: Audio generation failed +A: Check network connection and verify that email and password are correct + +### Q: How to change audio quality? 
+A: Edit the `kbitrate` field in `config.json` diff --git a/example/config.example.json b/example/config.example.json new file mode 100644 index 0000000..1631e13 --- /dev/null +++ b/example/config.example.json @@ -0,0 +1,8 @@ +{ + "// 注意": "请复制此文件为 config.json 并填写您的邮箱和密码", + "user_email": "your-email@example.com", + "user_pass": "your-password", + "api_url": "https://ttspro.cn/getSpeek.php", + "kbitrate": "audio-16khz-32kbitrate-mono-mp3", + "output_format": "binary" +} diff --git a/example/run.sh b/example/run.sh new file mode 100755 index 0000000..e511d15 --- /dev/null +++ b/example/run.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# Example run script +# Solve ts-node's inability to properly handle Chinese filenames + +# Check configuration file +if [ ! -f "config.json" ]; then + echo "❌ Error: config.json does not exist" + echo "📝 Please copy config.example.json to config.json and fill in email and password" + exit 1 +fi + +# Build project +echo "🔨 Building project..." +pnpm run build + +# Copy config.json to dist/example +echo "📋 Copying configuration file to output directory..." 
+cp config.json ../dist/example/ + +# Switch to dist/example directory to run examples +cd ../dist/example + + # Run example + case "$1" in + 0) + echo "🎙️ Running Example 0: Simple Dialogue Demo" + node "00-简单对话演示.js" + ;; + 1) + echo "🎙️ Running Example 1: Multi-Speaker Dialogue - Chained" + node "01-多说话人对话 - 链式调用.js" + ;; + 2) + echo "🎙️ Running Example 2: Multi-Speaker Dialogue - Functional" + node "02-多说话人对话 - 函数式.js" + ;; + 3) + echo "🎙️ Running Example 3: 31 Emotional Styles Demo" + node "03-31 种情感风格演示.js" + ;; + 4) + echo "🎙️ Running Example 4: Style Degree Control Demo" + node "04-情感强度控制演示.js" + ;; + 5) + echo "🎙️ Running Example 5: Text Substitution Demo" + node "05-文本替换功能演示.js" + ;; + *) + echo "Usage: ./run.sh " + echo "" + echo "Available examples:" + echo " 0 - Simple Dialogue Demo" + echo " 1 - Multi-Speaker Dialogue - Chained" + echo " 2 - Multi-Speaker Dialogue - Functional" + echo " 3 - 31 Emotional Styles Demo" + echo " 4 - Style Degree Control Demo" + echo " 5 - Text Substitution Demo" + exit 1 + ;; +esac diff --git a/src/AGENTS.md b/src/AGENTS.md new file mode 100644 index 0000000..e8fd790 --- /dev/null +++ b/src/AGENTS.md @@ -0,0 +1,145 @@ +# src/ Directory Knowledge Base + +**Module**: Core TTS Functionality Implementation + +--- + +## OVERVIEW + +MsEdgeTTS core source code directory - Contains all functionality implementations including WebSocket communication, SSML generation, audio output control, etc. 
+ +--- + +## WHERE TO LOOK + +| Task | File | Description | +|------|------|------| +| Modify WebSocket communication logic | `MsEdgeTTS.ts` | Connection initialization, message exchange, boundary metadata processing | +| Add new audio format | `Output.ts` | `OUTPUT_FORMAT` enum + `OUTPUT_EXTENSIONS` mapping | +| Modify voice options | `Prosody.ts` | `ProsodyOptions` class (rate/pitch/volume) | +| Modify dialogue builder | `DialogueBuilder.ts` | Chained builder + `buildDialogueSSML()` function | +| Add SSML utilities | `SSMLUtils.ts` | Escape functions, emotional style validation | +| Modify type definitions | `DialogueTurn.ts` | `DialogueTurn`, `Dialogue`, `TextSegment`, `Substitution` | +| Add unit tests | `*.spec.ts` | Same directory as source, Jest config in package.json | + +--- + +## FILE STRUCTURE + +``` +src/ +├── index.ts # Barrel export (6 exports) +├── MsEdgeTTS.ts # Core class (457 lines) +├── MsEdgeTTS.spec.ts # Unit tests +├── Output.ts # OUTPUT_FORMAT enum + OUTPUT_EXTENSIONS +├── Prosody.ts # ProsodyOptions class + RATE/PITCH/VOLUME enums +├── DialogueTurn.ts # DialogueTurn/Dialogue/TextSegment/Substitution types +├── DialogueBuilder.ts # DialogueBuilder class + buildDialogueSSML() function +├── SSMLUtils.ts # escapeSSML/replaceText/validateStyle/validateStyleDegree +└── utils.ts # joinPath() path joining utility +``` + +--- + +## CODE MAP + +| Symbol | Type | File | Role | +|--------|------|------|------| +| `MsEdgeTTS` | Class | `MsEdgeTTS.ts` | Main class: WebSocket connection, speech synthesis, stream processing | +| `OUTPUT_FORMAT` | Enum | `Output.ts` | Supported audio formats (MP3/WEBM multiple bitrates) | +| `OUTPUT_EXTENSIONS` | Const | `Output.ts` | Format to file extension mapping (`.mp3`/`.webm`) | +| `ProsodyOptions` | Class | `Prosody.ts` | Rate/pitch/volume configuration options | +| `RATE` | Enum | `Prosody.ts` | Speaking rate presets (x-slow to x-fast) | +| `PITCH` | Enum | `Prosody.ts` | Pitch presets (x-low to x-high) | +| 
`VOLUME` | Enum | `Prosody.ts` | Volume presets (silent to x-LOUD) | +| `DialogueBuilder` | Class | `DialogueBuilder.ts` | Chained dialogue builder | +| `buildDialogueSSML` | Function | `DialogueBuilder.ts` | Functional SSML generation | +| `validateStyle` | Function | `SSMLUtils.ts` | Validate 28 official emotional styles | +| `escapeSSML` | Function | `SSMLUtils.ts` | XML escape (& < > " ') | + +--- + +## CONVENTIONS + +**TypeScript Configuration**: +- `module`: CommonJS (not ESM, for compatibility) +- `target`: ESNext +- `skipLibCheck`: true +- Compilation exclusion: `src/**/*.spec.ts` + +**Testing Conventions**: +- Test files in same directory as source: `*.spec.ts` +- Jest config inline in `package.json` +- Test timeout: 15000ms + +**Export Pattern**: +- Use barrel export (`index.ts` unified export) +- 6 public APIs: `MsEdgeTTS`, `OUTPUT_FORMAT`, `ProsodyOptions`, `DialogueTurn`, `DialogueBuilder`, `buildDialogueSSML` + +**SSML Processing**: +- Only supports `speak`/`voice`/`prosody`/`mstts:express-as`/`lang`/`sub` elements +- Full SSML specification not supported + +--- + +## ANTI-PATTERNS (SRC) + +- ❌ **Do NOT** use in browser - API requires Edge User-Agent (server-side only) +- ❌ **Do NOT** modify Sec-MS-GEC hash algorithm in `MsEdgeTTS.ts` - depends on Azure authentication mechanism +- ❌ **Do NOT** remove `isomorphic-ws` dependency - enables cross-environment compatibility +- ❌ **Do NOT** use callback API - Promise only + +--- + +## UNIQUE STYLES + +**WebSocket Communication**: +- Sec-MS-GEC hash authentication (SHA-256 + Windows Tick timestamp) +- Custom UUID generation (not `crypto.randomUUID`) +- Message delimiter: `\r\n\r\n` + +**Logging System**: +- Optional logger (`enableLogger` option) +- Only logs connection status, message exchange + +**Multi-Speaker Dialogue Support**: +- `DialogueBuilder` chained calls +- `buildDialogueSSML()` functional API +- Supports 28 emotional styles + intensity control (0.01-2.0) +- Supports text substitution (`` tags) 
+- Supports multi-language mixing (``) + +--- + +## COMMANDS + +```bash +# Compile src/ to dist/ +pnpm run build + +# Run tests (src/*.spec.ts) +pnpm test + +# Test watch mode +pnpm run test:watch + +# Test coverage +pnpm run test:cov +``` + +--- + +## NOTES + +**Key Limitations**: +- December 2025 update: API requires Edge User-Agent, **cannot be used in browsers** +- Voice list requires trusted client Token (hardcoded: `6A5AA1D4EAFF4E9FB37E23D68491D6F4`) + +**Known Issues**: +- `MsEdgeTTS.ts` approximately 457 lines - high complexity, recommended to split + +**Adding New Features Process**: +1. Create `.ts` file at same level in `src/` +2. Add export in `index.ts` +3. Create `.spec.ts` test file with same name +4. Run `pnpm test` to verify diff --git a/src/DialogueBuilder.ts b/src/DialogueBuilder.ts new file mode 100644 index 0000000..a271aee --- /dev/null +++ b/src/DialogueBuilder.ts @@ -0,0 +1,155 @@ +import { Dialogue, type DialogueTurn } from "./DialogueTurn"; +import { escapeSSML, replaceText, validateStyle, validateStyleDegree } from "./SSMLUtils"; + +/** + * Dialogue builder class for chain-building multi-speaker dialogues + */ +export class DialogueBuilder { + private turns: DialogueTurn[] = []; + + /** + * Create a dialogue builder + */ + constructor() {} + + /** + * Add a dialogue turn (chained call) + * @param turn - Dialogue turn object + * @returns Current builder instance (supports chained calls) + * @throws Throws an error when turn parameter is invalid + */ + addTurn(turn: DialogueTurn): DialogueBuilder { + // Strict mode validation + if (!turn.voice || turn.voice.trim() === "") { + throw new Error("voice name is required and cannot be empty"); + } + + if (turn.text !== undefined && turn.text !== null && turn.text.trim() === "") { + throw new Error("text cannot be empty string"); + } + + if (turn.style !== undefined && turn.style !== null) { + validateStyle(turn.style); + } + + if (turn.styleDegree !== undefined && turn.styleDegree !== null) { + 
/**
 * Render `text` with every occurrence of each substitution's `text` replaced by
 * a `<sub alias="…">…</sub>` element, XML-escaping everything else.
 *
 * The text is kept as a list of raw / markup pieces instead of using textual
 * placeholders, so already-generated markup can never be re-matched by a later
 * substitution, and repeated occurrences are all replaced.
 */
function renderWithSubstitutions(
  text: string,
  substitutions: NonNullable<DialogueTurn["substitutions"]>,
): string {
  type Piece = { kind: "raw" | "markup"; value: string };

  // Longest text first so that e.g. "W3C" wins over "W3". Empty search strings
  // are dropped: they would match at every position.
  const ordered = substitutions
    .filter((sub) => sub.text !== "")
    .sort((a, b) => b.text.length - a.text.length);

  let pieces: Piece[] = [{ kind: "raw", value: text }];
  for (const sub of ordered) {
    const markup = `<sub alias="${escapeSSML(sub.alias)}">${escapeSSML(sub.text)}</sub>`;
    pieces = pieces.flatMap((piece): Piece[] => {
      if (piece.kind === "markup") {
        return [piece];
      }
      // Literal, global, non-overlapping split — no regex semantics involved.
      return piece.value.split(sub.text).flatMap((fragment, i): Piece[] =>
        i === 0
          ? [{ kind: "raw", value: fragment }]
          : [{ kind: "markup", value: markup }, { kind: "raw", value: fragment }],
      );
    });
  }

  return pieces
    .map((piece) => (piece.kind === "markup" ? piece.value : escapeSSML(piece.value)))
    .join("");
}

/**
 * Build SSML string for multi-speaker dialogue.
 *
 * Each turn becomes one `<voice>` element. Inside it, the content is the
 * turn's `children` when any are given, otherwise its `text` (with
 * `substitutions` applied), optionally wrapped in `<lang>` and
 * `<mstts:express-as>`. All text and attribute values are XML-escaped.
 *
 * NOTE(review): the element markup was not visible in the reviewed copy of this
 * file; tag and attribute names below follow the Azure SSML schema — confirm
 * the `<speak>` namespace attributes against the committed version.
 *
 * @param turns - Array of dialogue turns
 * @returns Complete SSML string; the document language is inferred from the
 *          first turn's voice name (falls back to "zh-CN-XiaoxiaoNeural").
 */
export function buildDialogueSSML(turns: DialogueTurn[]): string {
  const voiceElements = turns.map((turn) => {
    const rawText = turn.text ?? "";
    const textContent =
      turn.substitutions && turn.substitutions.length > 0
        ? renderWithSubstitutions(rawText, turn.substitutions)
        : escapeSSML(rawText);

    // Children, when present, take precedence over the turn-level text.
    const childrenContent = (turn.children ?? [])
      .map((segment) => {
        // A segment substitution replaces the spoken text; escape it like any
        // other text so it cannot inject markup.
        const spoken = escapeSSML(segment.substitution ? segment.substitution : segment.text);
        return segment.lang
          ? `<lang xml:lang="${escapeSSML(segment.lang)}">${spoken}</lang>`
          : spoken;
      })
      .join("");

    let voiceContent = childrenContent || textContent;

    if (turn.lang) {
      voiceContent = `<lang xml:lang="${escapeSSML(turn.lang)}">${voiceContent}</lang>`;
    }

    if (turn.style) {
      const styleDegreeAttr = turn.styleDegree != null ? ` styledegree="${turn.styleDegree}"` : "";
      voiceContent = `<mstts:express-as style="${escapeSSML(turn.style)}"${styleDegreeAttr}>${voiceContent}</mstts:express-as>`;
    }

    return `<voice name="${escapeSSML(turn.voice)}">${voiceContent}</voice>`;
  });

  // Infer primary language from the first voice name, e.g. "zh-CN" or "en-US".
  const firstVoice = turns[0]?.voice || "zh-CN-XiaoxiaoNeural";
  const lang = escapeSSML(firstVoice.split("-").slice(0, 2).join("-"));

  return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${lang}">
${voiceElements.join("\n")}
</speak>`;
}
string; + styleDegree?: number; + lang?: string; + substitutions?: Substitution[]; +} + +/** + * Dialogue class containing multiple dialogue turns and convertible to SSML + */ +export class Dialogue { + turns: DialogueTurn[] = []; + + /** + * Convert dialogue to SSML format + * @returns SSML string (placeholder implementation that currently always returns "", will be completed in subsequent tasks) + */ + toSSML(): string { + return ""; + } +} diff --git a/src/MsEdgeTTS.ts b/src/MsEdgeTTS.ts index 5f86370..b756c2c 100644 --- a/src/MsEdgeTTS.ts +++ b/src/MsEdgeTTS.ts @@ -7,6 +7,8 @@ import * as fs from "fs" import {Agent} from "http" import {ProsodyOptions} from "./Prosody" import {joinPath} from "./utils"; +import { Dialogue, DialogueTurn } from "./DialogueTurn"; +import { buildDialogueSSML } from "./DialogueBuilder"; export type Voice = { Name: string; @@ -307,6 +309,27 @@ export class MsEdgeTTS { return this._rawSSMLRequestToFile(dirPath, this._SSMLTemplate(input, options)) } + /** + * Writes raw audio synthesised from dialogue to a file. Supports multi-speaker conversations. + * + * @param dirPath a valid output directory path + * @param dialogue a {@link Dialogue} object or an array of {@link DialogueTurn} objects + * @param options (optional) {@link ProsodyOptions} - Note: currently unused by this method; the generated dialogue SSML is sent as-is + * @returns {Promise<{audioFilePath: string, metadataFilePath: string | null}>} - a `Promise` with the full filepaths + */ + toFileDialogue(dirPath: string, dialogue: Dialogue | DialogueTurn[], options?: ProsodyOptions): Promise<{ + audioFilePath: string, + metadataFilePath: string | null + }> { + let ssml: string; + if (dialogue instanceof Dialogue) { + ssml = dialogue.toSSML() || buildDialogueSSML(dialogue.turns); // toSSML() is still a stub returning "", so fall back to building from the turns + } else { + ssml = buildDialogueSSML(dialogue); + } + return this.rawToFile(dirPath, ssml); + } + /** * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SML template}.
* @@ -321,6 +344,25 @@ export class MsEdgeTTS { return this._rawSSMLRequest(this._SSMLTemplate(input, options)) } + /** + * Writes raw audio synthesised from dialogue in real-time to a {@link Readable}. Supports multi-speaker conversations. + * + * @param dialogue a {@link Dialogue} object or an array of {@link DialogueTurn} objects + * @returns {{audioStream: Readable, metadataStream: Readable | null}} - the streams, returned synchronously (not a `Promise`) + */ + toStreamDialogue(dialogue: Dialogue | DialogueTurn[]): { + audioStream: Readable, + metadataStream: Readable | null, + } { + let ssml: string; + if (dialogue instanceof Dialogue) { + ssml = dialogue.toSSML() || buildDialogueSSML(dialogue.turns); // Dialogue.toSSML() is still a stub returning "", so fall back to building from the turns + } else { + ssml = buildDialogueSSML(dialogue); + } + return this.rawToStream(ssml); + } + /** * Writes raw audio synthesised from text to a file. Has no SSML template. Basic SSML should be provided in the request. * diff --git a/src/SSMLUtils.ts b/src/SSMLUtils.ts new file mode 100644 index 0000000..128f631 --- /dev/null +++ b/src/SSMLUtils.ts @@ -0,0 +1,82 @@ +import type { Substitution } from "./DialogueTurn"; + +/** + * Escape SSML special characters + * Escape order: & first, then others to prevent double escaping + */ +export function escapeSSML(text: string): string { + return text + .replace(/&/g, "&amp;") + .replace(/</g, "&lt;") + .replace(/>/g, "&gt;") + .replace(/"/g, "&quot;") + .replace(/'/g, "&apos;"); +} + +/** + * Replace every literal occurrence of each substitution's text with its alias, one substitution after another (texts are not treated as regular expressions, "$" in an alias is not special, empty texts are skipped) + */ +export function replaceText(text: string, substitutions: Substitution[]): string { + let result = text; + for (const sub of substitutions.filter((s) => s.text !== "")) { + result = result.replace(new RegExp(sub.text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "g"), () => sub.alias); + } + return result; +} + +/** + * Officially supported 31 emotional styles by Microsoft Azure Speech Service + */ +const VALID_STYLES = [ + "advertisement_upbeat", + "affectionate", + "angry", + "assistant", + "calm", + "chat", + "cheerful", + "customerservice", + "depressed", + "documentary-narration", + "empathetic", + "excited", + "fearful", + "friendly",
+ "gentle", + "hopeful", + "lyrical", + "narration-professional", + "narration-relaxed", + "newscast", + "newscast-casual", + "newscast-formal", + "poetry-reading", + "sad", + "serious", + "shouting", + "sports_commentary", + "sports_commentary_excited", + "terrified", + "unfriendly", + "whispering", +] as const; + +/** + * Validate if style is a valid Microsoft official emotional style (exact, case-sensitive match against VALID_STYLES) + * Throws Error if invalid + */ +export function validateStyle(style: string): void { + if (!(VALID_STYLES as readonly string[]).includes(style)) { + throw new Error(`Invalid style "${style}". Valid styles: ${VALID_STYLES.join(", ")}`); + } +} + +/** + * Validate styleDegree range (0.01-2.0, both inclusive); NaN is rejected as well + * Throws Error if invalid + */ +export function validateStyleDegree(degree: number): void { + if (!(degree >= 0.01 && degree <= 2.0)) { + throw new Error("styleDegree must be between 0.01 and 2.0"); + } +} diff --git a/src/index.ts b/src/index.ts index a46a6ee..9e2cde2 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,3 +1,6 @@ export * from "./MsEdgeTTS" export * from "./Output" -export * from "./Prosody" \ No newline at end of file +export * from "./Prosody" +export * from "./DialogueTurn" +export * from "./DialogueBuilder" +export * from "./SSMLUtils" \ No newline at end of file diff --git a/tsconfig.json b/tsconfig.json index 08ba2c1..9926d11 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,6 +1,7 @@ { "include": [ - "src/**/*" + "src/**/*", + "example/**/*.ts" ], "exclude": [ "node_modules",