Use piper for realistic TTS, remove google tts

2023-04-28 15:48:52 +02:00 · 2023-04-28 15:48:52 +02:00 · f5947f03d9
commit f5947f03d9
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,11 @@
 venv/
 .env
 *.db
-__pycache__/
+__pycache__/
 db_data
 *.ogg
 piper/*.so*
 piper/piper
 piper/espeak*
 piper/voices
 MODEL_CARD
--- a/12
+++ b/12
@ -1,8 +1,16 @@
 FROM python:3.10-slim
-RUN apt update && apt install -y ffmpeg libespeak1
+# Set the voice language
 ARG VOICE_LANGUAGE=en
 RUN apt update && apt install -y ffmpeg wget libespeak1
 WORKDIR /app
 COPY ./entrypoint.sh /app
 RUN chmod +x /app/entrypoint.sh
 COPY ./piper /app/piper
 COPY ./main.py /app
 COPY ./database.py /app
 COPY ./requirements.txt /app
@ -11,4 +19,4 @@ RUN mkdir db_data
 RUN pip install --upgrade pip
 RUN pip install -r requirements.txt
-CMD [ "python3", "/app/main.py" ]
+ENTRYPOINT [ "/app/entrypoint.sh" ]
--- a/README.md
+++ b/README.md
@ -10,7 +10,7 @@ A telegram bot to interact with OpenAI API. You can:
 - Voice chat with ChatGPT:
   - Send voice message.
   - Receive voice messages.
-   - Use GoogleTTS or 100% local Espeak (more robotic).
+   - Use 100% local Text-To-Speech with Language Recognition to give ChatGPT a voice in many languages!
 Other features include:
@ -53,7 +53,7 @@ Self hosting this chatbot is pretty easy. You just need to follow this steps:
      - Set your ALLOWED_USERS (comma separated user ids). Set it to `*` to allow all users.
      - Set the SYSTEM_PROMPT for ChatGPT. This is always instructed to ChatGPT as the system.
      - Optional: Edit the MAX_CONTEXT. This variable sets the number of messages that will be sent to ChatGPT API as context for the conversation.
-      - WHISPER_TO_CHAT allows you to choose wether Whisper transcripts should be instructed to ChatGPT or not.
+      - WHISPER_TO_GPT allows you to choose whether Whisper transcripts should be sent to ChatGPT or not.
         - You can also configure this using `/settings` in chat.
      - ENABLE_GOOGLE_TTS: when enabled, the TTS service is provided by GoogleTTS, producing more natural voices. If disabled, it falls back to local voice generation using Espeak.
      - VOICE_LANGUAGE country code for the default voice accent.
--- a/entrypoint.sh
+++ b/entrypoint.sh
@ -0,0 +1,7 @@
 #!/bin/bash
 # Container entrypoint: run the piper setup script, then start the bot.
 echo "Installing piper for text to voice conversion..."
 bash /app/piper/init-piper.sh
 echo "Bot starting..."
 # exec replaces this shell with the Python process, so the bot itself
 # receives the signals sent to the container (e.g. SIGTERM on `docker stop`)
 # instead of being left behind a bash parent that does not forward them.
 exec python3 -u /app/main.py
--- a/example.env
+++ b/example.env
@ -6,10 +6,13 @@ CHATGPT_MAX_USER_CONTEXT=5
 CHATGPT_TEMPERATURE=1.0
 # Use Whisper transcript from voice message with ChatGPT
-WHISPER_TO_CHAT=1
+WHISPER_TO_GPT=1
-# Use Google TTS for speech to text
+
-ENABLE_GOOGLE_TTS=0
+# TTS Options
-VOICE_LANGUAGE=en # en, es, fr, de, it, pt, ru, ja, ko
+ENABLE_TTS=1
 # If ENABLE_TTS=1, you can set the following options
 VOICE_LANGUAGE_LIST=en,es,fr,it,pt,ca
 DEFAULT_VOICE_LANGUAGE=en
 BOT_TOKEN=your-telegram-bot-token
 BOT_ALLOWED_USERS= XXXX,YYYY # Comma separated list of Telegram user IDs
--- a/main.py
+++ b/main.py
@ -1,19 +1,19 @@
 import asyncio
 import logging
 import os
 import tempfile
 from functools import wraps
 from io import BytesIO
 import subprocess
 import openai
 import pyttsx3
 from aiogram import Bot, Dispatcher, types
 from aiogram.contrib.middlewares.logging import LoggingMiddleware
 from aiogram.types import InlineKeyboardButton, InlineKeyboardMarkup, ParseMode
 from aiogram.utils import executor
 from dotenv import load_dotenv
 from gtts import gTTS
 from pydub import AudioSegment
 from langdetect import detect
 import database
@ -39,9 +39,13 @@ ALLOWED_USERS = os.environ.get("BOT_ALLOWED_USERS").split(",")
 SYSTEM_PROMPT = os.environ.get("CHATGPT_SYSTEM_PROMPT")
 TEMPERATURE = os.environ.get("CHATGPT_TEMPERATURE")
 MODEL = os.environ.get("OPENAI_MODEL")
-WHISPER_TO_CHAT = bool(int(os.environ.get("WHISPER_TO_CHAT")))
+WHISPER_TO_GPT = bool(int(os.environ.get("WHISPER_TO_GPT")))
-ENABLE_GOOGLE_TTS = bool(int(os.environ.get("ENABLE_GOOGLE_TTS")))
+
-VOICE_LANGUAGE = os.environ.get("VOICE_LANGUAGE")
+# TTS Settings
 ENABLE_TTS = bool(int(os.environ.get("ENABLE_TTS")))
 DEFAULT_VOICE_LANGUAGE = os.environ.get("DEFAULT_VOICE_LANGUAGE")
 VOICE_LANGUAGE_LIST = os.environ.get("VOICE_LANGUAGE_LIST")
 MAX_USER_CONTEXT = int(os.environ.get("CHATGPT_MAX_USER_CONTEXT"))
 openai.api_key = os.environ.get("OPENAI_API_KEY")
@ -52,7 +56,7 @@ async def getUserData(chat_id):
            "context": [],
            "usage": {"chatgpt": 0, "whisper": 0, "dalle": 0},
            "options": {
-                "whisper_to_chat": WHISPER_TO_CHAT,
+                "whisper_to_chat": WHISPER_TO_GPT,
                "assistant_voice_chat": False,
                "temperature": float(TEMPERATURE),
                "max-context": MAX_USER_CONTEXT
@ -83,43 +87,34 @@ def generate_settings_markup(chat_id: str) -> InlineKeyboardMarkup:
    ]
    return InlineKeyboardMarkup(inline_keyboard=keyboard)
-def change_voice(engine, gender='male'):
+async def text_to_voice(text: str, language: str = None) -> BytesIO:
-    for voice in engine.getProperty('voices'):
+    binary_path = "/home/whoami/PROJECTS/openai-telegram-bot/piper/piper"
-        if VOICE_LANGUAGE in voice.languages[0].decode('utf-8') and gender == voice.gender:
+    if language is None:
-            engine.setProperty('voice', voice.id)
+        language = detect(text[0:80])
            return True
 async def text_to_voice(text: str) -> BytesIO:
    with tempfile.NamedTemporaryFile(mode='wb', suffix='.ogg', delete=False) as ogg_file:
        temp_filename = ogg_file.name
        voice_done = False
-        # If Google TTS is enabled, try to use it first
+    model_path = f"/home/whoami/PROJECTS/openai-telegram-bot/piper/voices/{language}.onnx"
-        if ENABLE_GOOGLE_TTS:
+    # Generate a unique temporary filename with '.ogg' extension
-            try:
+    with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as tmp:
-                tts = gTTS(text, lang=VOICE_LANGUAGE)
+        tmp_filename = tmp.name
-                tts.save(temp_filename)
+    
-                voice_done = True
+    text = text.replace('"', "")
-            except Exception as e:
+    # Make the text be in a single line
-                print("Google TTS failed, falling back to pyttsx3: --> ", e)
+    text = text.replace("\n", " ")    
-        
+    # Construct the command to execute the binary
-        # If Google TTS is disabled or failed, use pyttsx3
+    cmd = f"echo '{text}' | {binary_path} --model {model_path} --output_file {tmp_filename}"
-        if not voice_done:
+    
-            engine = pyttsx3.init()
+    # Run the binary and wait for it to finish
-            change_voice(engine)
+    subprocess.run(cmd, shell=True, check=True)
-            engine.setProperty('rate', 160)
+    
-            engine.save_to_file(text, temp_filename)
+    # Open the file in binary mode and read its content into BytesIO object
-            engine.runAndWait()
+    with open(tmp_filename, 'rb') as file:
-            engine.stop()
+        bytes_io = BytesIO(file.read())
            # Add a small delay before reading the file
            await asyncio.sleep(1)
-    with open(temp_filename, "rb") as audio_file:
+    # Delete the temporary file
-        voice_data = BytesIO(audio_file.read())
+    os.remove(tmp_filename)
-
+    
-    os.remove(temp_filename)
+    # Return the BytesIO object
-    voice_data.seek(0)
+    return bytes_io
    return voice_data
 def restricted(func):
@ -364,7 +359,7 @@ if __name__ == '__main__':
    print(f"Allowed users: {ALLOWED_USERS}")
    print(f"System prompt: {SYSTEM_PROMPT}")
-    print(f"Google TTS: {ENABLE_GOOGLE_TTS}")
+    print(f"TTS: {ENABLE_TTS}")
    # Register message handler and callback query handler for settings
    dp.register_message_handler(settings, commands=['settings'])
--- a/piper/init-piper.sh
+++ b/piper/init-piper.sh
@ -0,0 +1,81 @@
 #!/bin/bash
 # Install the piper text-to-speech binary and the voices requested in
 # VOICE_LANGUAGE_LIST (comma separated language codes). Does nothing unless
 # ENABLE_TTS=1. All paths are relative to the current working directory,
 # which is expected to hold .env and the piper/ directory.
 source .env
 PIPER_RELEASE_URL="https://github.com/rhasspy/piper/releases/download/v0.0.2"
 # Download one voice archive and install it as piper/voices/<lang>.onnx
 # (plus the matching .onnx.json).
 #   $1  language code, used as the installed file name (e.g. "en")
 #   $2  language name shown in log messages (e.g. "english")
 #   $3  voice name in the release; the archive is voice-<$3>.tar.gz
 # Returns non-zero when the archive cannot be downloaded or extracted.
 install_voice() {
     local lang="$1" label="$2" voice="$3"
     local archive="voice-$voice.tar.gz"
     # Skip the download when a previous run already installed this voice.
     if [ -f "piper/voices/$lang.onnx" ] && [ -f "piper/voices/$lang.onnx.json" ]; then
         echo "The $label voice is already installed, skipping"
         return 0
     fi
     echo "Downloading $label voice..."
     if ! wget -q "$PIPER_RELEASE_URL/$archive" || ! tar -xf "$archive"; then
         echo "Failed to download the $label voice" >&2
         rm -f "$archive"
         return 1
     fi
     mv "$voice.onnx" "piper/voices/$lang.onnx"
     mv "$voice.onnx.json" "piper/voices/$lang.onnx.json"
     rm -f "$archive"
     echo "Done"
 }
 if [ "$ENABLE_TTS" = 1 ]; then
     echo "Installing piper for text to voice conversion..."
     # NOTE(review): assumes the release archive unpacks into ./piper/ with the
     # executable at piper/piper -- confirm against the v0.0.2 tarball layout.
     if [ -x piper/piper ]; then
         echo "Piper is already installed, skipping download"
     else
         echo "Downloading piper v0.0.2.."
         if ! wget -q "$PIPER_RELEASE_URL/piper_amd64.tar.gz"; then
             echo "Failed to download piper" >&2
             exit 1
         fi
         echo "Extracting piper"
         tar -xf piper_amd64.tar.gz
         echo "Installing piper"
         rm -rf piper_amd64
         rm piper_amd64.tar.gz
         chmod -R 777 ./piper/
     fi
     # -p: the directory already exists when the container is restarted.
     mkdir -p piper/voices
     echo "Downloading tts voices from VOICE_LANGUAGE_LIST..."
     echo "This can take a while..."
     # tr turns the comma separated list into words; the unquoted command
     # substitution is intentional so the shell splits it into one code per
     # iteration.
     for lang in $(echo "$VOICE_LANGUAGE_LIST" | tr "," " "); do
         case "$lang" in
             en) install_voice en english en-us-ryan-high ;;
             es) install_voice es spanish es-mls_10246-low ;;
             fr) install_voice fr french fr-siwis-medium ;;
             it) install_voice it italian it-riccardo_fasol-x-low ;;
             pt) install_voice pt portuguese pt-br-edresson-low ;;
             ca) install_voice ca catalan ca-upc_ona-x-low ;;
             *) echo "No voice available for language '$lang', skipping" >&2 ;;
         esac
     done
     echo "Done. Piper installed!"
 else
     echo "TTS Disabled. No work to do..."
 fi
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +1,5 @@
 aiogram==2.25.1
-gTTS==2.3.1
+langdetect==1.0.9
 openai==0.27.2
 pydub==0.25.1
 python-dotenv==1.0.0
 pyttsx3==2.90
--- a/utils.py
+++ b/utils.py
@ -0,0 +1,20 @@
 import subprocess
 import tempfile
 import os
 def text_to_speech(
     text: str,
     binary_path: str = "./piper",
     model_path: str = "blizzard_lessac-medium.onnx",
 ) -> str:
     """Synthesize *text* with the piper binary and return the output path.

     The text is written to piper's standard input instead of being pasted
     into a shell command line, so quotes and shell metacharacters in it can
     neither break the command nor inject another one.

     Args:
         text: Text to speak.
         binary_path: Path of the piper executable.
         model_path: Voice model handed to piper via ``--model``.

     Returns:
         Path of a newly created temporary file that piper was asked to write
         its output to. The caller is responsible for deleting it.

     Raises:
         subprocess.CalledProcessError: If piper exits with a non-zero status
             or cannot be started at all (reported with return code 127, the
             status a shell uses for "command not found").
     """
     # Reserve a unique output path; delete=False keeps the file after the
     # handle is closed so the external process can write to it.
     with tempfile.NamedTemporaryFile(delete=False) as tmp:
         tmp_filename = tmp.name
     cmd = [binary_path, "--model", model_path, "--output_file", tmp_filename]
     try:
         # The previous `echo ... |` pipeline ended the text with a newline;
         # keep doing so.
         subprocess.run(cmd, input=f"{text}\n".encode("utf-8"), check=True)
     except OSError as err:
         # Nobody learns the temp path when we raise, so remove it here.
         os.unlink(tmp_filename)
         # Keep the exception type callers saw when a shell ran the command.
         raise subprocess.CalledProcessError(127, cmd) from err
     except subprocess.CalledProcessError:
         os.unlink(tmp_filename)
         raise
     return tmp_filename