Use piper for realistic TTS, remove google tts

main
pluja 2023-04-28 15:48:52 +02:00
rodzic 92fc09617a
commit f5947f03d9
9 zmienionych plików z 174 dodań i 54 usunięć

9
.gitignore vendored
Wyświetl plik

@ -1,4 +1,11 @@
venv/ venv/
.env .env
*.db *.db
__pycache__/ __pycache__/
db_data
*.ogg
piper/*.so*
piper/piper
piper/espeak*
piper/voices
MODEL_CARD

Wyświetl plik

@ -1,8 +1,16 @@
FROM python:3.10-slim FROM python:3.10-slim
RUN apt update && apt install -y ffmpeg libespeak1 # Set the voice language
ARG VOICE_LANGUAGE=en
RUN apt update && apt install -y ffmpeg wget libespeak1
WORKDIR /app WORKDIR /app
COPY ./entrypoint.sh /app
RUN chmod +x /app/entrypoint.sh
COPY ./piper /app/piper
COPY ./main.py /app COPY ./main.py /app
COPY ./database.py /app COPY ./database.py /app
COPY ./requirements.txt /app COPY ./requirements.txt /app
@ -11,4 +19,4 @@ RUN mkdir db_data
RUN pip install --upgrade pip RUN pip install --upgrade pip
RUN pip install -r requirements.txt RUN pip install -r requirements.txt
CMD [ "python3", "/app/main.py" ] ENTRYPOINT [ "/app/entrypoint.sh" ]

Wyświetl plik

@ -10,7 +10,7 @@ A telegram bot to interact with OpenAI API. You can:
- Voice chat with ChatGPT: - Voice chat with ChatGPT:
- Send voice message. - Send voice message.
- Receive voice messages. - Receive voice messages.
- Use GoogleTTS or 100% local Espeak (more robotic). - Use 100% local Text-To-Speech with Language Recognition to give ChatGPT a voice in many languages!
Other features include: Other features include:
@ -53,7 +53,7 @@ Self hosting this chatbot is pretty easy. You just need to follow this steps:
- Set your ALLOWED_USERS (comma separated user ids). Set it to `*` to allow all users. - Set your ALLOWED_USERS (comma separated user ids). Set it to `*` to allow all users.
- Set the SYSTEM_PROMPT for ChatGPT. This is always instructed to ChatGPT as the system. - Set the SYSTEM_PROMPT for ChatGPT. This is always instructed to ChatGPT as the system.
- Optional: Edit the MAX_CONTEXT. This variable sets the number of messages that will be sent to ChatGPT API as context for the conversation. - Optional: Edit the MAX_CONTEXT. This variable sets the number of messages that will be sent to ChatGPT API as context for the conversation.
- WHISPER_TO_CHAT allows you to choose whether Whisper transcripts should be instructed to ChatGPT or not. - WHISPER_TO_GPT allows you to choose whether Whisper transcripts should be instructed to ChatGPT or not.
- You can also configure this using `/settings` in chat. - You can also configure this using `/settings` in chat.
- ENABLE_GOOGLE_TTS: if enabled, the TTS service will be provided by GoogleTTS, producing more natural voices. If disabled, it falls back to local voice generation using Espeak. - ENABLE_GOOGLE_TTS: if enabled, the TTS service will be provided by GoogleTTS, producing more natural voices. If disabled, it falls back to local voice generation using Espeak.
- VOICE_LANGUAGE country code for the default voice accent. - VOICE_LANGUAGE country code for the default voice accent.

7
entrypoint.sh 100644
Wyświetl plik

@ -0,0 +1,7 @@
#!/bin/bash
# Container entrypoint: run the piper (text-to-speech) setup script, then
# start the Telegram bot.

echo "Installing piper for text to voice conversion..."
bash /app/piper/init-piper.sh

echo "Bot starting..."
# exec replaces this shell with the Python process, so signals sent to the
# container's main process (e.g. SIGTERM from `docker stop`) reach the bot
# directly instead of stopping at an intermediate bash process.
exec python3 -u /app/main.py

Wyświetl plik

@ -6,10 +6,13 @@ CHATGPT_MAX_USER_CONTEXT=5
CHATGPT_TEMPERATURE=1.0 CHATGPT_TEMPERATURE=1.0
# Use Whisper transcript from voice message with ChatGPT # Use Whisper transcript from voice message with ChatGPT
WHISPER_TO_CHAT=1 WHISPER_TO_GPT=1
# Use Google TTS for speech to text
ENABLE_GOOGLE_TTS=0 # TTS Options
VOICE_LANGUAGE=en # en, es, fr, de, it, pt, ru, ja, ko ENABLE_TTS=1
# If ENABLE_TTS=1, you can set the following options
VOICE_LANGUAGE_LIST=en,es,fr,it,pt,ca
DEFAULT_VOICE_LANGUAGE=en
BOT_TOKEN=your-telegram-bot-token BOT_TOKEN=your-telegram-bot-token
BOT_ALLOWED_USERS= XXXX,YYYY # Comma separated list of Telegram user IDs BOT_ALLOWED_USERS= XXXX,YYYY # Comma separated list of Telegram user IDs

81
main.py
Wyświetl plik

@ -1,19 +1,19 @@
import asyncio
import logging import logging
import os import os
import tempfile import tempfile
from functools import wraps from functools import wraps
from io import BytesIO from io import BytesIO
import subprocess
import openai import openai
import pyttsx3
from aiogram import Bot, Dispatcher, types from aiogram import Bot, Dispatcher, types
from aiogram.contrib.middlewares.logging import LoggingMiddleware from aiogram.contrib.middlewares.logging import LoggingMiddleware
from aiogram.types import InlineKeyboardButton, InlineKeyboardMarkup, ParseMode from aiogram.types import InlineKeyboardButton, InlineKeyboardMarkup, ParseMode
from aiogram.utils import executor from aiogram.utils import executor
from dotenv import load_dotenv from dotenv import load_dotenv
from gtts import gTTS
from pydub import AudioSegment from pydub import AudioSegment
from langdetect import detect
import database import database
@ -39,9 +39,13 @@ ALLOWED_USERS = os.environ.get("BOT_ALLOWED_USERS").split(",")
SYSTEM_PROMPT = os.environ.get("CHATGPT_SYSTEM_PROMPT") SYSTEM_PROMPT = os.environ.get("CHATGPT_SYSTEM_PROMPT")
TEMPERATURE = os.environ.get("CHATGPT_TEMPERATURE") TEMPERATURE = os.environ.get("CHATGPT_TEMPERATURE")
MODEL = os.environ.get("OPENAI_MODEL") MODEL = os.environ.get("OPENAI_MODEL")
WHISPER_TO_CHAT = bool(int(os.environ.get("WHISPER_TO_CHAT"))) WHISPER_TO_GPT = bool(int(os.environ.get("WHISPER_TO_GPT")))
ENABLE_GOOGLE_TTS = bool(int(os.environ.get("ENABLE_GOOGLE_TTS")))
VOICE_LANGUAGE = os.environ.get("VOICE_LANGUAGE") # TTS Settings
ENABLE_TTS = bool(int(os.environ.get("ENABLE_TTS")))
DEFAULT_VOICE_LANGUAGE = os.environ.get("DEFAULT_VOICE_LANGUAGE")
VOICE_LANGUAGE_LIST = os.environ.get("VOICE_LANGUAGE_LIST")
MAX_USER_CONTEXT = int(os.environ.get("CHATGPT_MAX_USER_CONTEXT")) MAX_USER_CONTEXT = int(os.environ.get("CHATGPT_MAX_USER_CONTEXT"))
openai.api_key = os.environ.get("OPENAI_API_KEY") openai.api_key = os.environ.get("OPENAI_API_KEY")
@ -52,7 +56,7 @@ async def getUserData(chat_id):
"context": [], "context": [],
"usage": {"chatgpt": 0, "whisper": 0, "dalle": 0}, "usage": {"chatgpt": 0, "whisper": 0, "dalle": 0},
"options": { "options": {
"whisper_to_chat": WHISPER_TO_CHAT, "whisper_to_chat": WHISPER_TO_GPT,
"assistant_voice_chat": False, "assistant_voice_chat": False,
"temperature": float(TEMPERATURE), "temperature": float(TEMPERATURE),
"max-context": MAX_USER_CONTEXT "max-context": MAX_USER_CONTEXT
@ -83,43 +87,34 @@ def generate_settings_markup(chat_id: str) -> InlineKeyboardMarkup:
] ]
return InlineKeyboardMarkup(inline_keyboard=keyboard) return InlineKeyboardMarkup(inline_keyboard=keyboard)
def change_voice(engine, gender='male'): async def text_to_voice(text: str, language: str = None) -> BytesIO:
for voice in engine.getProperty('voices'): binary_path = "/home/whoami/PROJECTS/openai-telegram-bot/piper/piper"
if VOICE_LANGUAGE in voice.languages[0].decode('utf-8') and gender == voice.gender: if language is None:
engine.setProperty('voice', voice.id) language = detect(text[0:80])
return True
async def text_to_voice(text: str) -> BytesIO:
with tempfile.NamedTemporaryFile(mode='wb', suffix='.ogg', delete=False) as ogg_file:
temp_filename = ogg_file.name
voice_done = False
# If Google TTS is enabled, try to use it first model_path = f"/home/whoami/PROJECTS/openai-telegram-bot/piper/voices/{language}.onnx"
if ENABLE_GOOGLE_TTS: # Generate a unique temporary filename with '.ogg' extension
try: with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as tmp:
tts = gTTS(text, lang=VOICE_LANGUAGE) tmp_filename = tmp.name
tts.save(temp_filename)
voice_done = True text = text.replace('"', "")
except Exception as e: # Make the text be in a single line
print("Google TTS failed, falling back to pyttsx3: --> ", e) text = text.replace("\n", " ")
# Construct the command to execute the binary
# If Google TTS is disabled or failed, use pyttsx3 cmd = f"echo '{text}' | {binary_path} --model {model_path} --output_file {tmp_filename}"
if not voice_done:
engine = pyttsx3.init() # Run the binary and wait for it to finish
change_voice(engine) subprocess.run(cmd, shell=True, check=True)
engine.setProperty('rate', 160)
engine.save_to_file(text, temp_filename) # Open the file in binary mode and read its content into BytesIO object
engine.runAndWait() with open(tmp_filename, 'rb') as file:
engine.stop() bytes_io = BytesIO(file.read())
# Add a small delay before reading the file
await asyncio.sleep(1)
with open(temp_filename, "rb") as audio_file: # Delete the temporary file
voice_data = BytesIO(audio_file.read()) os.remove(tmp_filename)
os.remove(temp_filename) # Return the BytesIO object
voice_data.seek(0) return bytes_io
return voice_data
def restricted(func): def restricted(func):
@ -364,7 +359,7 @@ if __name__ == '__main__':
print(f"Allowed users: {ALLOWED_USERS}") print(f"Allowed users: {ALLOWED_USERS}")
print(f"System prompt: {SYSTEM_PROMPT}") print(f"System prompt: {SYSTEM_PROMPT}")
print(f"Google TTS: {ENABLE_GOOGLE_TTS}") print(f"TTS: {ENABLE_TTS}")
# Register message handler and callback query handler for settings # Register message handler and callback query handler for settings
dp.register_message_handler(settings, commands=['settings']) dp.register_message_handler(settings, commands=['settings'])

Wyświetl plik

@ -0,0 +1,81 @@
#!/bin/bash
# Downloads the piper text-to-speech release and one voice model for every
# language code listed in VOICE_LANGUAGE_LIST (comma separated). Each model is
# renamed to <lang>.onnx / <lang>.onnx.json and moved into piper/voices/.
# Nothing is downloaded unless ENABLE_TTS is 1.

# Read settings from .env when it exists in the current directory; otherwise
# rely on whatever is already set in the environment.
if [ -f .env ]; then
    source .env
fi

RELEASE_URL="https://github.com/rhasspy/piper/releases/download/v0.0.2"

# download_voice LANG LABEL VOICE
#   LANG  - language code used for the final file names (e.g. "en")
#   LABEL - human readable language name used in the progress message
#   VOICE - release voice name; the archive is voice-<VOICE>.tar.gz and is
#           expected to unpack to <VOICE>.onnx and <VOICE>.onnx.json
download_voice() {
    local lang="$1" label="$2" voice="$3"
    local archive="voice-${voice}.tar.gz"

    echo "Downloading ${label} voice..."
    if ! wget -q "${RELEASE_URL}/${archive}"; then
        # Report the failure instead of printing "Done" for a voice that
        # never arrived.
        echo "Failed to download ${label} voice, skipping." >&2
        rm -f "${archive}"
        return 1
    fi
    tar -xf "${archive}"
    mv "${voice}.onnx" "${lang}.onnx"
    mv "${voice}.onnx.json" "${lang}.onnx.json"
    rm -rf "${archive}"
    echo "Done"
}

if [ "$ENABLE_TTS" = 1 ]; then
    echo "Installing piper for text to voice conversion..."
    echo "Downloading piper v0.0.2.."
    wget -q "${RELEASE_URL}/piper_amd64.tar.gz"
    echo "Extracting piper"
    tar -xf piper_amd64.tar.gz
    echo "Installing piper"
    rm -rf piper_amd64
    rm piper_amd64.tar.gz
    # NOTE(review): 777 is broader than needed just to make the binary
    # executable -- consider tightening; kept as-is to preserve behavior.
    chmod -R 777 ./piper/
    # -p: the directory already exists when the script runs a second time
    # (e.g. on container restart).
    mkdir -p piper/voices

    echo "Downloading tts voices from VOICE_LANGUAGE_LIST..."
    echo "This can take a while..."
    # Turn the comma separated list into words; language codes without a
    # known voice are silently ignored.
    for lang in ${VOICE_LANGUAGE_LIST//,/ }; do
        case "$lang" in
            en) download_voice en english en-us-ryan-high ;;
            es) download_voice es spanish es-mls_10246-low ;;
            fr) download_voice fr french fr-siwis-medium ;;
            it) download_voice it italian it-riccardo_fasol-x-low ;;
            pt) download_voice pt portuguese pt-br-edresson-low ;;
            ca) download_voice ca catalan ca-upc_ona-x-low ;;
        esac
    done

    echo "Moving voices to piper/voices/"
    # Only move when at least one model file exists, so an empty or
    # unsupported VOICE_LANGUAGE_LIST does not make mv fail on the
    # unexpanded glob.
    if compgen -G "*.onnx*" > /dev/null; then
        mv *.onnx* piper/voices/
    fi
    echo "Done. Piper installed!"
else
    echo "TTS Disabled. No work to do..."
fi

Wyświetl plik

@ -1,6 +1,5 @@
aiogram==2.25.1 aiogram==2.25.1
gTTS==2.3.1 langdetect==1.0.9
openai==0.27.2 openai==0.27.2
pydub==0.25.1 pydub==0.25.1
python-dotenv==1.0.0 python-dotenv==1.0.0
pyttsx3==2.90

20
utils.py 100644
Wyświetl plik

@ -0,0 +1,20 @@
import subprocess
import tempfile
import os
def text_to_speech(text: str) -> str:
    """Synthesize *text* with the piper binary and return the output path.

    The text is fed to piper on standard input (followed by a newline, as
    ``echo`` would do) and piper is asked to write its result to a freshly
    created temporary file.

    Args:
        text: The text to convert to speech.

    Returns:
        Path of the temporary file given to piper as ``--output_file``.
        The file is not deleted here; the caller owns it.

    Raises:
        subprocess.CalledProcessError: If piper cannot be started or exits
            with a non-zero status. The temporary file is removed first.
    """
    binary_path = "./piper"
    model_path = "blizzard_lessac-medium.onnx"

    # Reserve a unique output path; delete=False keeps the file on disk
    # after the handle is closed so piper can write to it.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp_filename = tmp.name

    # Argument list + stdin instead of a shell string: the text never passes
    # through a shell, so quotes or metacharacters in it cannot break the
    # command or inject other commands.
    cmd = [binary_path, "--model", model_path, "--output_file", tmp_filename]
    try:
        subprocess.run(cmd, input=f"{text}\n".encode("utf-8"), check=True)
    except OSError as err:
        # Binary missing / not executable. The shell-based invocation
        # surfaced this as a failed command, so keep raising the same type.
        os.remove(tmp_filename)
        raise subprocess.CalledProcessError(127, cmd) from err
    except subprocess.CalledProcessError:
        # Do not leave an orphaned temp file behind when synthesis fails.
        os.remove(tmp_filename)
        raise

    return tmp_filename