diff --git a/example.env b/example.env index 3e7e363..6d7964f 100644 --- a/example.env +++ b/example.env @@ -1,3 +1,6 @@ +# # # # # # # # # # +# OpenAI / ChatGPT # +# # # # # # # # # # OPENAI_API_KEY=your-openai-api-key OPENAI_MODEL=gpt-3.5-turbo @@ -8,11 +11,21 @@ CHATGPT_TEMPERATURE=1.0 # Use Whisper transcript from voice message with ChatGPT WHISPER_TO_GPT=1 -# TTS Options +# # # # # # # # +# TTS Options # +# # # # # # # # ENABLE_TTS=1 -# If USE_TTS=1, you can set the following options -VOICE_LANGUAGE_LIST=en,es,fr,it,pt,ca -DEFAULT_VOICE_LANGUAGE=en +# If ENABLE_TTS=1, you can set the following options +# Remove any language whose voice you don't want to download +VOICE_LANGUAGE_LIST=en,es,fr,it,pt,ca,no,nl,de +DEFAULT_VOICE_LANGUAGE=en + +# Do not change this line +LANGUAGES_H_SR=en,fr,nl,no + +# # # # # # # # # # +# Telegram Options # +# # # # # # # # # # BOT_TOKEN=your-telegram-bot-token BOT_ALLOWED_USERS= XXXX,YYYY # Comma separated list of Telegram user IDs diff --git a/main.py b/main.py index 3a74b69..5a986a0 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,10 @@ +import asyncio import logging import os -import tempfile +import wave from functools import wraps from io import BytesIO - -import subprocess +from typing import Tuple import openai from aiogram import Bot, Dispatcher, types @@ -12,8 +12,8 @@ from aiogram.contrib.middlewares.logging import LoggingMiddleware from aiogram.types import InlineKeyboardButton, InlineKeyboardMarkup, ParseMode from aiogram.utils import executor from dotenv import load_dotenv -from pydub import AudioSegment from langdetect import detect +from pydub import AudioSegment import database @@ -46,6 +46,7 @@ WHISPER_TO_GPT = bool(int(os.environ.get("WHISPER_TO_GPT"))) ENABLE_TTS = bool(int(os.environ.get("ENABLE_TTS"))) DEFAULT_VOICE_LANGUAGE = os.environ.get("DEFAULT_VOICE_LANGUAGE") VOICE_LANGUAGE_LIST = os.environ.get("VOICE_LANGUAGE_LIST") +LANGUAGES_22050 = os.environ.get("LANGUAGES_H_SR").split(",") MAX_USER_CONTEXT = 
int(os.environ.get("CHATGPT_MAX_USER_CONTEXT")) openai.api_key = os.environ.get("OPENAI_API_KEY") @@ -107,7 +108,15 @@ def generate_settings_markup(chat_id: str) -> InlineKeyboardMarkup: return InlineKeyboardMarkup(inline_keyboard=keyboard) -async def text_to_voice(text: str, language: str = None) -> BytesIO: +async def send_voice_message(chat_id, assistant_message): + await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING) + audio_data, _ = await text_to_voice(assistant_message) + audio_data.seek(0) # Reset the buffer's position to the beginning + + await bot.send_voice(chat_id, audio_data) + + +async def text_to_voice(text: str, language: str = None) -> Tuple[BytesIO, str]: binary_path = "./piper/piper" if language is None: @@ -115,34 +124,30 @@ async def text_to_voice(text: str, language: str = None) -> BytesIO: model_path = f"./piper/voices/{language}.onnx" - # Generate a unique temporary filename with '.ogg' extension - with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as tmp: - tmp_filename = tmp.name + text = text.replace("\n", ". ") - # Run the binary with the escaped text as input and the temp file as output - with open(tmp_filename, "wb") as tmp_file: - process = subprocess.Popen( - [binary_path, "--model", model_path, "--output_file", "-"], - stdin=subprocess.PIPE, - stdout=tmp_file, - stderr=subprocess.PIPE, - text=True, - encoding="utf8", - ) - - # Remove all newlines from the text so that the text is read as a single sentence - text = text.replace("\n", ". 
") - process.communicate(input=text) + cmd = [binary_path, "--model", model_path, "--output_raw"] + proc = await asyncio.create_subprocess_exec( + *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE + ) - # Open the file in binary mode and read its content into BytesIO object - with open(tmp_filename, "rb") as file: - bytes_io = BytesIO(file.read()) + stdout, _ = await proc.communicate(input=text.encode("utf-8")) - # Delete the temporary file - os.remove(tmp_filename) + # Create a new BytesIO object to store the WAV file. + wav_file = BytesIO() + with wave.open(wav_file, "wb") as wf: + # Assuming 1 channel, 16 bits per sample, and 22050 samples per second. + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(22050 if language in LANGUAGES_22050 else 16000) + wf.writeframes(stdout) - # Return the BytesIO object - return bytes_io + wav_file.seek(0) + + audio = AudioSegment.from_file(wav_file, format="wav") + audio = audio.export(format="ogg", codec="libopus", parameters=["-vbr", "on"]) + + return BytesIO(audio.read()), "ogg" def restricted(func): @@ -254,7 +259,7 @@ async def usage(message: types.Message) -> None: - Generated {user_usage["dalle"]} images with DALL-E. - Transcribed {round(float(user_usage["whisper"]) / 60.0, 2)}min with Whisper. -Total spent: ${user_spent} ({user_percentage:.2f}% of total) +User total: ${user_spent} ({user_percentage:.2f}% of total) Total usage: - ChatGPT tokens: {total_usage["chatgpt"]} @@ -297,9 +302,9 @@ async def attachment(message: types.Message): await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING) transcript = {"text": ""} - audioMessage = False + # Handle media types if message.voice: user_data["usage"]["whisper"] += message.voice.duration file_id = message.voice.file_id @@ -317,16 +322,19 @@ async def attachment(message: types.Message): await message.reply("Can't handle such file. 
Reason: unknown.") return + # Download file from Telegram file = await bot.get_file(file_id) user_id = message.chat.id await file.download(f"{user_id}.{file_format}") + # Convert audio to mp3 if needed, because OpenAI doesn't support ogg if file_format == "ogg": ogg_audio = AudioSegment.from_file(f"{user_id}.ogg", format="ogg") ogg_audio.export(f"{user_id}.mp3", format="mp3") os.remove(f"{user_id}.ogg") file_format = "mp3" + # Transcribe audio with OpenAI API with open(f"{user_id}.{file_format}", "rb") as audio_file: try: await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING) @@ -336,25 +344,28 @@ async def attachment(message: types.Message): await message.reply("Transcript failed.") os.remove(f"{user_id}.{file_format}") return + os.remove(f"{user_id}.{file_format}") - os.remove(f"{user_id}.{file_format}") - + # Handle empty transcript if transcript["text"] == "": transcript["text"] = "[Silence]" + # If whisper_to_chat is enabled, send transcript to ChatGPT and send its response along with the transcript chatGPT_response = False if audioMessage and user_data["options"]["whisper_to_chat"]: chatGPT_response, user_data = await messageGPT( transcript["text"], str(chat_id), message.from_user.full_name, user_data ) - transcript["text"] = "> " + transcript["text"] + "\n\n" + chatGPT_response + transcript["text"] = ''.join(["> ", transcript["text"], "\n\n", chatGPT_response]) + # Send transcript (and ChatGPT response if enabled) await message.reply(transcript["text"]) + + # Send ChatGPT response as voice message with piper TTS if enabled if user_data["options"]["assistant_voice_chat"] and chatGPT_response: - await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING) - voice_data = await text_to_voice(chatGPT_response) - await message.reply_voice(voice_data) - + asyncio.create_task(send_voice_message(chat_id, chatGPT_response)) + + # Update user data database.update_user(str(chat_id), user_data) @@ -420,9 +431,7 @@ async def chat(message: 
types.Message): await message.reply(assistant_message, parse_mode=ParseMode.MARKDOWN) if user_data["options"]["assistant_voice_chat"]: - await bot.send_chat_action(chat_id, action=types.ChatActions.TYPING) - voice_data = await text_to_voice(assistant_message) - await message.reply_voice(voice_data) + asyncio.create_task(send_voice_message(chat_id, assistant_message)) if __name__ == "__main__":