From f5947f03d9cfb0a400227f4d501166db8c608bf6 Mon Sep 17 00:00:00 2001 From: pluja Date: Fri, 28 Apr 2023 15:48:52 +0200 Subject: [PATCH] Use piper for realistic TTS, remove google tts --- .gitignore | 9 ++++- Dockerfile | 12 +++++-- README.md | 4 +-- entrypoint.sh | 7 ++++ example.env | 11 +++--- main.py | 81 +++++++++++++++++++++------------------------ piper/init-piper.sh | 81 +++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 3 +- utils.py | 20 +++++++++++ 9 files changed, 174 insertions(+), 54 deletions(-) create mode 100644 entrypoint.sh create mode 100755 piper/init-piper.sh create mode 100644 utils.py diff --git a/.gitignore b/.gitignore index 0e7f7e4..822b06e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,11 @@ venv/ .env *.db -__pycache__/ \ No newline at end of file +__pycache__/ +db_data +*.ogg +piper/*.so* +piper/piper +piper/espeak* +piper/voices +MODEL_CARD \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index eb177e4..adc8505 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,16 @@ FROM python:3.10-slim -RUN apt update && apt install -y ffmpeg libespeak1 +# Set the voice language +ARG VOICE_LANGUAGE=en + +RUN apt update && apt install -y ffmpeg wget libespeak1 WORKDIR /app +COPY ./entrypoint.sh /app +RUN chmod +x /app/entrypoint.sh + +COPY ./piper /app/piper + COPY ./main.py /app COPY ./database.py /app COPY ./requirements.txt /app @@ -11,4 +19,4 @@ RUN mkdir db_data RUN pip install --upgrade pip RUN pip install -r requirements.txt -CMD [ "python3", "/app/main.py" ] \ No newline at end of file +ENTRYPOINT [ "/app/entrypoint.sh" ] \ No newline at end of file diff --git a/README.md b/README.md index 1705fd4..7091235 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ A telegram bot to interact with OpenAI API. You can: - Voice chat with ChatGPT: - Send voice message. - Receive voice messages. - - Use GoogleTTS or 100% local Espeak (more robotic). 
+ - Use 100% local Text-To-Speech with Language Recognition to give ChatGPT a voice in many languages! Other features include: @@ -53,7 +53,7 @@ Self hosting this chatbot is pretty easy. You just need to follow this steps: - Set your ALLOWED_USERS (comma separated user ids). Set it to `*` to allow all users. - Set the SYSTEM_PROMPT for ChatGPT. This is always instructed to ChatGPT as the system. - Optional: Edit the MAX_CONTEXT. This variable sets the number of messages that will be sent to ChatGPT API as context for the conversation. - - WHISPER_TO_CHAT allows you to choose wether Whisper transcripts should be instructed to ChatGPT or not. + - WHISPER_TO_GPT allows you to choose whether Whisper transcripts should be instructed to ChatGPT or not. - You can also configure this using `/settings` in chat. - ENABLE_GOOGLE_TTS the TTS service will be provided by GoogleTTS, producing more natural voices. If disabled, it fallsback to local voice generation using Espeak. - VOICE_LANGUAGE country code for the default voice accent. diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100644 index 0000000..c63d2db --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +echo "Installing piper for text to voice conversion..." +bash /app/piper/init-piper.sh + +echo "Bot starting..." 
+python3 -u /app/main.py \ No newline at end of file diff --git a/example.env b/example.env index 8dd7f99..3e7e363 100644 --- a/example.env +++ b/example.env @@ -6,10 +6,13 @@ CHATGPT_MAX_USER_CONTEXT=5 CHATGPT_TEMPERATURE=1.0 # Use Whisper transcript from voice message with ChatGPT -WHISPER_TO_CHAT=1 -# Use Google TTS for speech to text -ENABLE_GOOGLE_TTS=0 -VOICE_LANGUAGE=en # en, es, fr, de, it, pt, ru, ja, ko +WHISPER_TO_GPT=1 + +# TTS Options +ENABLE_TTS=1 +# If ENABLE_TTS=1, you can set the following options +VOICE_LANGUAGE_LIST=en,es,fr,it,pt,ca +DEFAULT_VOICE_LANGUAGE=en BOT_TOKEN=your-telegram-bot-token BOT_ALLOWED_USERS= XXXX,YYYY # Comma separated list of Telegram user IDs diff --git a/main.py b/main.py index b9c0282..7910a74 100644 --- a/main.py +++ b/main.py @@ -1,19 +1,19 @@ -import asyncio import logging import os import tempfile from functools import wraps from io import BytesIO +import subprocess + import openai -import pyttsx3 from aiogram import Bot, Dispatcher, types from aiogram.contrib.middlewares.logging import LoggingMiddleware from aiogram.types import InlineKeyboardButton, InlineKeyboardMarkup, ParseMode from aiogram.utils import executor from dotenv import load_dotenv -from gtts import gTTS from pydub import AudioSegment +from langdetect import detect import database @@ -39,9 +39,13 @@ ALLOWED_USERS = os.environ.get("BOT_ALLOWED_USERS").split(",") SYSTEM_PROMPT = os.environ.get("CHATGPT_SYSTEM_PROMPT") TEMPERATURE = os.environ.get("CHATGPT_TEMPERATURE") MODEL = os.environ.get("OPENAI_MODEL") -WHISPER_TO_CHAT = bool(int(os.environ.get("WHISPER_TO_CHAT"))) -ENABLE_GOOGLE_TTS = bool(int(os.environ.get("ENABLE_GOOGLE_TTS"))) -VOICE_LANGUAGE = os.environ.get("VOICE_LANGUAGE") +WHISPER_TO_GPT = bool(int(os.environ.get("WHISPER_TO_GPT"))) + +# TTS Settings +ENABLE_TTS = bool(int(os.environ.get("ENABLE_TTS"))) +DEFAULT_VOICE_LANGUAGE = os.environ.get("DEFAULT_VOICE_LANGUAGE") +VOICE_LANGUAGE_LIST = os.environ.get("VOICE_LANGUAGE_LIST") + 
async def text_to_voice(
    text: str, language: str = None, *, piper_dir: str = None
) -> BytesIO:
    """Synthesize *text* with the local piper binary and return the audio bytes.

    Args:
        text: The text to speak. Double quotes are removed and newlines are
            replaced by spaces before the text is handed to piper.
        language: Code of the voice model to use (``<language>.onnx`` inside
            the voices directory). When ``None`` it is detected from the
            first 80 characters of *text*.
        piper_dir: Directory holding the ``piper`` executable and its
            ``voices/`` sub-directory. Defaults to ``piper/`` next to this
            module, so the same code works in the container and in a checkout.

    Returns:
        A ``BytesIO`` positioned at 0 with whatever piper wrote to its
        ``--output_file``.

    Raises:
        subprocess.CalledProcessError: piper exited with a non-zero status.
        OSError: the piper executable could not be started.
    """
    # The module-level asyncio import went away with the old TTS code, so
    # import it here to keep this function self-contained.
    import asyncio

    if piper_dir is None:
        piper_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "piper")
    binary_path = os.path.join(piper_dir, "piper")
    voices_dir = os.path.join(piper_dir, "voices")

    if language is None:
        from langdetect.lang_detect_exception import LangDetectException

        try:
            language = detect(text[0:80])
        except LangDetectException:
            # Raised for text without detectable features (digits, emoji, ...).
            language = DEFAULT_VOICE_LANGUAGE

    model_path = os.path.join(voices_dir, f"{language}.onnx")
    if not os.path.isfile(model_path):
        # The requested/detected language has no installed voice: use the
        # configured default instead of failing outright.
        model_path = os.path.join(voices_dir, f"{DEFAULT_VOICE_LANGUAGE}.onnx")

    # Same clean-up as before: drop double quotes, keep everything on one line.
    single_line = text.replace('"', "").replace("\n", " ")

    # Reserve a unique '.ogg' path for piper to write into.
    with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as tmp:
        tmp_filename = tmp.name

    try:
        # The text goes to piper on stdin and the command is an argument list,
        # so no shell is involved: apostrophes and shell metacharacters in the
        # chat text can neither break nor hijack the command. Running in a
        # worker thread keeps the bot's event loop responsive meanwhile.
        await asyncio.to_thread(
            subprocess.run,
            [binary_path, "--model", model_path, "--output_file", tmp_filename],
            input=(single_line + "\n").encode("utf-8"),
            check=True,
        )
        with open(tmp_filename, "rb") as audio_file:
            return BytesIO(audio_file.read())
    finally:
        # Always remove the temporary file, including when piper fails.
        os.remove(tmp_filename)
+ wget -q https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-ryan-high.tar.gz + tar -xf voice-en-us-ryan-high.tar.gz + mv en-us-ryan-high.onnx en.onnx + mv en-us-ryan-high.onnx.json en.onnx.json + rm -rf voice-en-us-ryan-high.tar.gz + echo "Done" + fi + if [ "$lang" = "es" ] ; then + echo "Downloading spanish voice..." + wget -q https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-es-mls_10246-low.tar.gz + tar -xf voice-es-mls_10246-low.tar.gz + mv es-mls_10246-low.onnx es.onnx + mv es-mls_10246-low.onnx.json es.onnx.json + rm -rf voice-es-mls_10246-low.tar.gz + echo "Done" + fi + if [ "$lang" = "fr" ] ; then + echo "Downloading french voice..." + wget -q https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-fr-siwis-medium.tar.gz + tar -xf voice-fr-siwis-medium.tar.gz + mv fr-siwis-medium.onnx fr.onnx + mv fr-siwis-medium.onnx.json fr.onnx.json + rm -rf voice-fr-siwis-medium.tar.gz + echo "Done" + fi + if [ "$lang" = "it" ]; then + echo "Downloading italian voice..." + wget -q https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-it-riccardo_fasol-x-low.tar.gz + tar -xf voice-it-riccardo_fasol-x-low.tar.gz + mv it-riccardo_fasol-x-low.onnx it.onnx + mv it-riccardo_fasol-x-low.onnx.json it.onnx.json + rm -rf voice-it-riccardo_fasol-x-low.tar.gz + echo "Done" + fi + if [ "$lang" = "pt" ]; then + echo "Downloading portuguese voice..." + wget -q https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-pt-br-edresson-low.tar.gz + tar -xf voice-pt-br-edresson-low.tar.gz + mv pt-br-edresson-low.onnx pt.onnx + mv pt-br-edresson-low.onnx.json pt.onnx.json + rm -rf voice-pt-br-edresson-low.tar.gz + echo "Done" + fi + if [ "$lang" = "ca" ] ; then + echo "Downloading catalan voice..." 
def text_to_speech(
    text: str,
    binary_path: str = "./piper",
    model_path: str = "blizzard_lessac-medium.onnx",
) -> str:
    """Run the piper binary on *text* and return the path of the audio file.

    Args:
        text: The text to synthesize; it is written to piper's stdin followed
            by a newline.
        binary_path: Path of the piper executable.
        model_path: Path of the voice model passed to ``--model``.

    Returns:
        The name of a temporary file that piper was told to write via
        ``--output_file``. The caller owns the file and must delete it.

    Raises:
        subprocess.CalledProcessError: piper exited with a non-zero status.
        OSError: the piper executable could not be started.
    """
    # Reserve a unique temporary filename for the output.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp_filename = tmp.name

    try:
        # Argument list + stdin instead of an interpolated shell string:
        # apostrophes no longer break the command and shell metacharacters in
        # the text are never interpreted.
        subprocess.run(
            [binary_path, "--model", model_path, "--output_file", tmp_filename],
            input=(text + "\n").encode("utf-8"),
            check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        # No usable output was produced; do not leave the temp file behind.
        os.remove(tmp_filename)
        raise

    return tmp_filename