From 6d6ae6fd68ac4015486635afc9e9a61b894a658e Mon Sep 17 00:00:00 2001 From: chdeskur Date: Sat, 27 Jul 2024 19:13:02 -0400 Subject: [PATCH] update spec --- fern/docs.yml | 7 +- fern/openapi/openapi-overrides.yml | 35 +-- fern/openapi/openapi.yml | 421 ++++------------------------- 3 files changed, 73 insertions(+), 390 deletions(-) diff --git a/fern/docs.yml b/fern/docs.yml index 270d623..c7c02a8 100644 --- a/fern/docs.yml +++ b/fern/docs.yml @@ -104,7 +104,7 @@ navigation: - page: Stream Speech (WebSocket) path: api-reference/endpoints/stream-speech-websocket.mdx icon: plug - - POST /tts/bytes + - POST /tts/clip - POST /tts/sse - section: REST contents: @@ -112,8 +112,7 @@ navigation: - POST /voices - GET /voices/{id} - DELETE /voices/{id} + - GET /voices/{id}/embedding - POST /voices/clone/clip - - section: Webhooks - contents: - - GET / + - POST /voices/clone/url - tab: changelog diff --git a/fern/openapi/openapi-overrides.yml b/fern/openapi/openapi-overrides.yml index bbf4b17..14f0cd4 100644 --- a/fern/openapi/openapi-overrides.yml +++ b/fern/openapi/openapi-overrides.yml @@ -2,39 +2,28 @@ servers: - url: "https://api.cartesia.ai" x-fern-server-name: Production paths: - /tts/bytes: - post: - x-fern-streaming: true - x-fern-sdk-method-name: streamSpeechBytes - /tts/sse: - post: - x-fern-streaming: true - x-fern-sdk-method-name: streamSpeechServerSentEvents - /voices: - get: - x-fern-sdk-method-name: listVoices - post: - x-fern-sdk-method-name: createVoice - /voices/{id}: - get: - x-fern-sdk-method-name: getVoice - delete: - x-fern-sdk-method-name: deleteVoice - /voices/clone/clip: - post: - x-fern-sdk-method-name: clone_voice_from_clip + # /tts/clip: + # post: + # x-fern-streaming: true + # /tts/sse: + # post: + # x-fern-streaming: true /: get: x-fern-ignore: true - "/": + "": get: + operationId: apiStatusAndVersion servers: - url: "wss://api.cartesia.ai" x-name: WebSocket - x-fern-sdk-method-name: apiStatusAndVersion + # x-fern-webhook: true security: [] summary: API Status and Version description: Returns the server's version and a status message which can be useful for sanity checking. + requestBody: + content: + application/json: {} responses: "200": description: Everything OK. diff --git a/fern/openapi/openapi.yml b/fern/openapi/openapi.yml index f9d2b0c..18bcbfe 100644 --- a/fern/openapi/openapi.yml +++ b/fern/openapi/openapi.yml @@ -2,7 +2,7 @@ openapi: 3.0.0 info: title: Cartesia REST API - version: "2024-06-10" + version: 2024-06-10 servers: - url: "https://api.cartesia.ai" @@ -11,13 +11,11 @@ security: - APIKeyHeader: [] paths: - "/tts/bytes": + "/tts/clip": post: - summary: Stream Speech (Bytes) + summary: Stream Speech (Clip) description: |- - Generate audio from a transcript using a given voice and model. The audio is streamed out as raw bytes. - parameters: - - $ref: "#/components/parameters/Cartesia-Version" + Generate audio from a transcript using a given voice and model. The audio is returned as a binary clip, i.e. this endpoint streams out raw bytes. requestBody: $ref: "#/components/requestBodies/TTSRequest" responses: @@ -34,10 +32,6 @@ paths: "/tts/sse": post: summary: Stream Speech (Server-Sent Events) - description: |- - Generate audio from a transcript using a given voice and model. The audio is streamed out as Server-Sent Events. - parameters: - - $ref: "#/components/parameters/Cartesia-Version" requestBody: $ref: "#/components/requestBodies/TTSRequest" responses: @@ -49,20 +43,32 @@ paths: oneOf: - $ref: "#/components/schemas/JSONChunkResponse" - $ref: "#/components/schemas/JSONDoneResponse" - - $ref: "#/components/schemas/JSONTimestampResponse" - $ref: "#/components/schemas/JSONErrorResponse" discriminator: propertyName: status_code default: $ref: "#/components/responses/Error" + "/voices/{id}/embedding": + get: + summary: Get Voice Embedding + parameters: + - in: path + name: id + required: true + schema: + "$ref": "#/components/schemas/VoiceId" + responses: + "200": + $ref: "#/components/responses/VoiceEmbeddingResponse" + default: + $ref: "#/components/responses/Error" + "/voices": get: summary: List Voices description: |- List all voices available to the user, that is, public voices and the user's own voices. - parameters: - - $ref: "#/components/parameters/Cartesia-Version" responses: "200": description: List of voices. @@ -77,8 +83,6 @@ paths: summary: Create Voice description: |- Create a new voice with a given name, description, and embedding. - parameters: - - $ref: "#/components/parameters/Cartesia-Version" requestBody: required: true content: @@ -110,7 +114,6 @@ paths: get: summary: Get Voice parameters: - - $ref: "#/components/parameters/Cartesia-Version" - in: path name: id required: true @@ -128,7 +131,6 @@ paths: delete: summary: Delete Voice parameters: - - $ref: "#/components/parameters/Cartesia-Version" - in: path name: id required: true @@ -146,8 +148,6 @@ paths: description: |- Clones a voice from an audio clip uploaded as a file. The clip is uploaded using multipart/form-data with a `clip` field containing the audio file. operationId: clone_voice_from_clip - parameters: - - $ref: "#/components/parameters/Cartesia-Version" requestBody: required: true content: @@ -164,9 +164,28 @@ paths: default: $ref: "#/components/responses/Error" + "/voices/clone/url": + post: + summary: Clone Voice (URL) + description: |- + Clone a voice from an online URL. Currently only supports YouTube. + parameters: + - required: true + schema: + type: string + title: Link + description: The URL of the video to clone the voice from. + example: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: link + in: query + responses: + "200": + $ref: "#/components/responses/VoiceEmbeddingResponse" + default: + $ref: "#/components/responses/Error" + "/": get: - security: [] summary: API Status and Version description: Returns the server's version and a status message which can be useful for sanity checking. responses: @@ -193,16 +212,6 @@ components: in: header name: X-API-Key - parameters: - Cartesia-Version: - in: header - name: Cartesia-Version - description: The version of the Cartesia API to use. - required: true - schema: - type: string - example: "2024-06-10" - requestBodies: TTSRequest: required: true @@ -220,7 +229,7 @@ components: model_id: title: Model ID type: string - example: sonic-english + example: upbeat-moon transcript: title: Transcript type: string @@ -234,293 +243,35 @@ components: description: The voice to use for the speech. Can be either an ID or an embedding, specified by the `mode` field. oneOf: - type: object - title: Voice ID properties: mode: type: string enum: [id] id: "$ref": "#/components/schemas/VoiceId" - __experimental_controls: - type: object - properties: - speed: - title: Speed - description: Controls the speed of the voice. - type: string - enum: [slowest, slow, normal, fast, fastest] - emotion: - title: Emotion - description: Controls the amount of emotion to add to the voice. Accepts an array of `emotion:level` tags. - type: array - items: - type: string - pattern: '^(anger|positivity|surprise|sadness|curiosity):(lowest|low|high|highest)$' - example: - mode: "id" - id: &voiceID "a0e99841-438c-4a64-b679-ae501e7d6091" - __experimental_controls: - { - speed: "normal", - emotion: ["positivity:high", "curiosity"] - } - type: object - title: Voice Embedding properties: mode: type: string enum: [embedding] embedding: "$ref": "#/components/schemas/VoiceEmbedding" - __experimental_controls: - type: object - properties: - speed: - title: Speed - description: Controls the speed of the voice. - type: string - enum: [slowest, slow, normal, fast, fastest] - emotion: - title: Emotion - description: Controls the amount of emotion to add to the voice. Accepts an array of `emotion:level` tags. - type: array - items: - type: string - pattern: '^(anger|positivity|surprise|sadness|curiosity):(lowest|low|high|highest)$' - example: - mode: "embedding" - embedding: - &voiceEmbedding [ - -0.033633083, - 0.072083704, - -0.01807767, - -0.083488315, - -0.04407617, - 0.0022592682, - 0.070505895, - 0.023946615, - -0.04788024, - -0.06388413, - -0.0716355, - -0.0022612812, - -0.0053448505, - -0.07848381, - 0.0348162, - -0.053745482, - -0.092399485, - -0.02950225, - 0.028591828, - -0.10556894, - 0.023313355, - 0.06224387, - 0.0362463, - 0.029258432, - 0.10769641, - 0.043595582, - -0.058543224, - -0.080402784, - -0.0953816, - -0.008988032, - -0.0028981369, - -0.004752721, - -0.20742874, - 0.058907595, - 0.08813939, - -0.06192675, - 0.099082634, - -0.09661578, - -0.0077761724, - -0.013982456, - -0.025798267, - 0.04467142, - 0.026222011, - 0.023023574, - 0.011227064, - -0.17462021, - -0.09880612, - -0.1521035, - -0.060464993, - -0.04735665, - -0.09725187, - -0.006127679, - 0.15818526, - -0.039493002, - -0.067719474, - 0.0066190436, - -0.10636633, - 0.17073768, - -0.051717706, - 0.03186961, - -0.020547207, - -0.02244247, - 0.013196935, - -0.06431055, - -0.115360335, - 0.016918058, - -0.033195216, - 0.11255181, - 0.020366343, - -0.041032124, - 0.08780918, - -0.040567942, - 0.057276532, - 0.05848221, - -0.077479474, - -0.073524915, - -0.01913317, - -0.029291833, - 0.11210393, - -0.09859328, - 0.2152541, - -0.022976823, - 0.028627992, - -0.039598297, - 0.041829932, - -0.05593181, - -0.06444655, - -0.018057477, - -0.008098263, - 0.05994528, - 0.10430693, - -0.13121894, - -0.06512868, - -0.026126215, - 0.046727825, - -0.17180993, - -0.10577226, - -0.08610466, - 0.008862588, - 0.09547498, - -0.010965332, - -0.061217085, - -0.038954042, - 0.019930292, - -0.017192135, - 0.007296275, - 0.03273872, - 0.04389937, - -0.056483064, - 0.003420891, - -0.10319067, - -0.015706042, - 0.1308774, - -0.0018035866, - -0.03582506, - 0.077131025, - 0.013398928, - 0.003188886, - 0.12039741, - -0.033974767, - 0.06899378, - -0.059775922, - -0.026934423, - 0.028482193, - 0.100996524, - 0.004498743, - -0.02291186, - 0.078752205, - -0.0063796206, - 0.04206536, - 0.05721349, - 0.06290694, - 0.06130212, - 0.096969016, - -0.057664312, - -0.16727506, - -0.035220966, - 0.090760484, - 0.010039947, - 0.06513242, - 0.011055657, - -0.004258431, - -0.08316792, - -0.15650468, - -0.076931365, - 0.11385587, - -0.038372636, - 0.015648656, - -0.12029895, - -0.06604956, - 0.009441591, - -0.11912808, - 0.013378132, - 0.029525978, - -0.0056742397, - -0.0075976513, - 0.019999338, - -0.05521377, - -0.07650746, - -0.017710293, - -0.033986397, - -0.047768556, - 0.13857274, - 0.099290825, - 0.11736938, - 0.017834296, - -0.07140237, - -0.052047748, - -0.06398965, - -0.037033975, - -0.061061256, - -0.03330076, - -0.024472248, - -0.059656, - 0.05359946, - -0.043915518, - -0.086325996, - 0.14189173, - 0.021086395, - 0.02945159, - 0.1029604, - 0.018490415, - -0.028736332, - -0.025272416, - -0.06082937, - -0.031339463, - -0.0007249595, - 0.025595888, - 0.007144545, - -0.16938712, - -0.1160664, - -0.0654145, - ] - __experimental_controls: - { - speed: "normal", - emotion: ["positivity:high", "curiosity"] - } discriminator: propertyName: mode output_format: title: Output Format - description: The format of the audio data in the response. WAV is only supported on the `/tts/bytes` endpoint. - discriminator: - propertyName: container - oneOf: - - type: object - title: Raw/WAV - required: [container, encoding, sample_rate] - properties: - container: - type: string - enum: ["raw", "wav"] - encoding: - type: string - enum: - ["pcm_s16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw"] - sample_rate: - type: integer - enum: [8000, 16000, 22050, 24000, 44100] - language: - title: Language - type: string - enum: ["en", "es", "fr", "de", "pt", "zh", "ja"] - example: "en" - add_timestamps: - title: Add Timestamps - type: boolean - description: Whether to add timestamps to the audio. This is only supported on `tts/sse` and WebSocket endpoints. - example: false + type: object + required: [container, encoding, sample_rate] + properties: + container: + type: string + enum: ["raw"] + encoding: + type: string + enum: ["pcm_s16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw"] + sample_rate: + type: integer + enum: [8000, 16000, 22050, 24000, 44100] responses: Error: @@ -581,17 +332,16 @@ components: type: string example: "A deep, rich, male voice with an Indian accent." - VoiceId: - title: Voice ID - type: string - example: *voiceID - VoiceEmbedding: title: Voice Embedding items: type: number type: array - example: *voiceEmbedding + + VoiceId: + title: Voice ID + type: string + example: "a0e99841-438c-4a64-b679-ae501e7d6091" VoiceName: title: VoiceName @@ -615,10 +365,6 @@ components: title: Done type: boolean description: Whether the model has finished generating audio. - type: - title: Type - type: string - description: The type of response. JSONChunkResponse: title: JSON Chunk Response @@ -633,9 +379,6 @@ components: done: <<: *done enum: [false] - type: - title: Type - enum: [chunk] data: title: Data type: string @@ -646,48 +389,6 @@ components: type: integer description: The time in milliseconds that the model took to generate this chunk of audio. - JSONTimestampResponse: - title: JSON Timestamp Response - type: object - allOf: - - "$ref": "#/components/schemas/JSONBaseResponse" - - type: object - properties: - status_code: - <<: *statusCode - enum: [206] - done: - <<: *done - enum: [false] - type: - title: Type - enum: [timestamps] - word_timestamps: - title: Word Timestamps - type: object - description: Object containing the timestamps for each word in the transcript. - properties: - words: - title: Words - type: array - items: - type: string - description: The individual words in the transcript. - start: - title: Start - type: array - items: - type: number - format: float - description: The starting timestamp for each word (in seconds). - end: - title: End - type: array - items: - type: number - format: float - description: The ending timestamp for each word (in seconds). - JSONDoneResponse: title: JSON Done Response type: object @@ -701,9 +402,6 @@ components: done: <<: *done enum: [true] - type: - title: Type - enum: [done] JSONErrorResponse: title: JSON Error Response @@ -719,10 +417,7 @@ components: done: <<: *done enum: [true] - type: - title: Type - enum: [error] error: title: Error type: string - description: A human-readable error message. + description: A human-readable error message. \ No newline at end of file