forked from langchain-ai/langchain
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add ElevenLabs text to speech tool (langchain-ai#10525)
- Loading branch information
Showing
7 changed files
with
328 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,226 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "a991a6f8-1897-4f49-a191-ae3bdaeda856", | ||
"metadata": {}, | ||
"source": [ | ||
"# Eleven Labs Text2Speech\n", | ||
"\n", | ||
"This notebook shows how to use the ElevenLabs API to convert text to speech." | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "9eeb311e-e1bd-4959-8536-4d267f302eb3", | ||
"metadata": {}, | ||
"source": [ | ||
"First, you need to set up an ElevenLabs account. You can follow the instructions [here](https://docs.elevenlabs.io/welcome/introduction)." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "0a309c0e-5310-4eaa-8af9-bcbc252e45da", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# !pip install elevenlabs" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "f097c3b1-f761-43cb-aad0-8ba2e93e5f5f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"\n", | ||
"os.environ[\"ELEVEN_API_KEY\"] = \"\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "434b2454-2bff-484d-822c-4026a9dc1383", | ||
"metadata": {}, | ||
"source": [ | ||
"## Usage" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "2f57a647-9214-4562-a8cf-f263a15d1f40", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'eleven_labs_text2speech'" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"from langchain.tools import ElevenLabsText2SpeechTool\n", | ||
"\n", | ||
"text_to_speak = \"Hello world! I am the real slim shady\"\n", | ||
"\n", | ||
"tts = ElevenLabsText2SpeechTool()\n", | ||
"tts.name" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "d4613fed-66f0-47c6-be50-7e7670654427", | ||
"metadata": {}, | ||
"source": [ | ||
"We can generate audio, save it to a temporary file, and then play it." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "f1984844-aa75-4f83-9d42-1c8052d87cc0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"speech_file = tts.run(text_to_speak)\n", | ||
"tts.play(speech_file)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "42d89cd4-ac2a-4857-9787-c9018b4a8782", | ||
"metadata": {}, | ||
"source": [ | ||
"Or stream audio directly." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"id": "d72822f8-3223-47e2-8d2e-6ff46b8c8645", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"tts.stream_speech(text_to_speak)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "a152766d-5f06-48b1-ac89-b4e8d88d3c9f", | ||
"metadata": {}, | ||
"source": [ | ||
"## Use within an Agent" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"id": "37626aea-0cf0-4849-9c00-c0f40515ffe0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain import OpenAI\n", | ||
"from langchain.agents import initialize_agent, AgentType, load_tools" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 13, | ||
"id": "c168f28e-d5b7-4c93-bed8-0ab317b4a44b", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"llm = OpenAI(temperature=0)\n", | ||
"tools = load_tools([\"eleven_labs_text2speech\"])\n", | ||
"agent = initialize_agent(\n", | ||
" tools=tools,\n", | ||
" llm=llm,\n", | ||
" agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,\n", | ||
" verbose=True,\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 14, | ||
"id": "336bf95a-3ccb-4963-aac3-638a4df2ed78", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"\n", | ||
"\n", | ||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", | ||
"\u001b[32;1m\u001b[1;3mAction:\n", | ||
"```\n", | ||
"{\n", | ||
" \"action\": \"eleven_labs_text2speech\",\n", | ||
" \"action_input\": {\n", | ||
" \"query\": \"Why did the chicken cross the playground? To get to the other slide!\"\n", | ||
" }\n", | ||
"}\n", | ||
"```\n", | ||
"\n", | ||
"\u001b[0m\n", | ||
"Observation: \u001b[36;1m\u001b[1;3m/tmp/tmpsfg783f1.wav\u001b[0m\n", | ||
"Thought:\u001b[32;1m\u001b[1;3m I have the audio file ready to be sent to the human\n", | ||
"Action:\n", | ||
"```\n", | ||
"{\n", | ||
" \"action\": \"Final Answer\",\n", | ||
" \"action_input\": \"/tmp/tmpsfg783f1.wav\"\n", | ||
"}\n", | ||
"```\n", | ||
"\n", | ||
"\u001b[0m\n", | ||
"\n", | ||
"\u001b[1m> Finished chain.\u001b[0m\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"audio_file = agent.run(\"Tell me a joke and read it out for me.\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 15, | ||
"id": "f0aa7aa9-4682-4599-8cae-59347d9e5210", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"tts.play(audio_file)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.16" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
"""Eleven Labs Services Tools.""" | ||
|
||
from langchain.tools.eleven_labs.text2speech import ElevenLabsText2SpeechTool | ||
|
||
__all__ = ["ElevenLabsText2SpeechTool"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from enum import Enum | ||
|
||
|
||
class ElevenLabsModel(str, Enum):
    """Identifiers of the Eleven Labs Text2Speech models.

    Each member is also a ``str``, so it compares equal to its raw
    model-name value.
    """

    # Model name used for multilingual synthesis.
    MULTI_LINGUAL = "eleven_multilingual_v1"
    # Model name used for monolingual synthesis.
    MONO_LINGUAL = "eleven_monolingual_v1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import tempfile | ||
from enum import Enum | ||
from typing import Any, Dict, Optional, Union | ||
|
||
from langchain.callbacks.manager import CallbackManagerForToolRun | ||
from langchain.pydantic_v1 import root_validator | ||
from langchain.tools.base import BaseTool | ||
from langchain.utils import get_from_dict_or_env | ||
|
||
|
||
def _import_elevenlabs() -> Any:
    """Return the ``elevenlabs`` module, importing it on demand.

    Raises:
        ImportError: If ``elevenlabs`` cannot be imported. The original
            import error is chained as the cause.
    """
    try:
        import elevenlabs
    except ImportError as exc:
        message = "Cannot import elevenlabs, please install `pip install elevenlabs`."
        raise ImportError(message) from exc
    else:
        return elevenlabs
|
||
|
||
class ElevenLabsModel(str, Enum):
    """Identifiers of the Eleven Labs Text2Speech models.

    Each member is also a ``str``, so it compares equal to its raw
    model-name value.
    """

    # Model name used for multilingual synthesis.
    MULTI_LINGUAL = "eleven_multilingual_v1"
    # Model name used for monolingual synthesis.
    MONO_LINGUAL = "eleven_monolingual_v1"
|
||
|
||
class ElevenLabsText2SpeechTool(BaseTool):
    """Tool that queries the Eleven Labs Text2Speech API.

    In order to set this up, follow instructions at:
    https://docs.elevenlabs.io/welcome/introduction

    Besides the standard tool entry point, the class offers two
    convenience helpers: ``play`` for a previously generated audio file
    and ``stream_speech`` for playing audio while it is being generated.
    """

    # Model identifier forwarded to ``elevenlabs.generate``; either an
    # ``ElevenLabsModel`` member or a raw model-name string.
    model: Union[ElevenLabsModel, str] = ElevenLabsModel.MULTI_LINGUAL

    name: str = "eleven_labs_text2speech"
    description: str = (
        "A wrapper around Eleven Labs Text2Speech. "
        "Useful for when you need to convert text to speech. "
        "It supports multiple languages, including English, German, Polish, "
        "Spanish, Italian, French, Portuguese, and Hindi. "
    )

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key exists in environment.

        Looks the key up under ``eleven_api_key`` in ``values`` or under
        the ``ELEVEN_API_KEY`` environment variable.

        Args:
            values: Raw init values handed to the validator.

        Returns:
            ``values``, unchanged.
        """
        # Only the lookup matters here: the result is discarded, not stored.
        # NOTE(review): presumably ``get_from_dict_or_env`` raises when the
        # key is found in neither place -- confirm against its definition.
        _ = get_from_dict_or_env(values, "eleven_api_key", "ELEVEN_API_KEY")

        return values

    def _run(
        self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Convert ``query`` to speech and save the audio to a temporary file.

        Args:
            query: Text to convert to speech.
            run_manager: Optional callback manager; not used by this method.

        Returns:
            Path of the temporary file holding the generated audio. The file
            is created with ``delete=False``, so it is not removed
            automatically when closed.

        Raises:
            ImportError: If the ``elevenlabs`` package is not installed.
            RuntimeError: If generating or writing the audio fails. The
                underlying exception is attached as ``__cause__``.
        """
        elevenlabs = _import_elevenlabs()
        try:
            speech = elevenlabs.generate(text=query, model=self.model)
            with tempfile.NamedTemporaryFile(
                mode="bx", suffix=".wav", delete=False
            ) as f:
                f.write(speech)
            # Return after the ``with`` block so the file is flushed and
            # closed before its path is handed to the caller.
            return f.name
        except Exception as e:
            # Chain explicitly so the original failure is reported as the
            # direct cause instead of an incidental context.
            raise RuntimeError(
                f"Error while running ElevenLabsText2SpeechTool: {e}"
            ) from e

    def play(self, speech_file: str) -> None:
        """Play the audio stored in ``speech_file``.

        Args:
            speech_file: Path of an audio file, e.g. the value returned by
                running this tool.

        Raises:
            ImportError: If the ``elevenlabs`` package is not installed.
        """
        elevenlabs = _import_elevenlabs()
        with open(speech_file, mode="rb") as f:
            speech = f.read()

        elevenlabs.play(speech)

    def stream_speech(self, query: str) -> None:
        """Stream the text as speech as it is generated.

        The audio is requested with ``stream=True`` and handed straight to
        ``elevenlabs.stream`` for playback; nothing is written to disk.

        Args:
            query: Text to convert to speech.

        Raises:
            ImportError: If the ``elevenlabs`` package is not installed.
        """
        elevenlabs = _import_elevenlabs()
        speech_stream = elevenlabs.generate(text=query, model=self.model, stream=True)
        elevenlabs.stream(speech_stream)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters