Support for detecting and removing spam messages when the captcha has not been solved.

2019-04-12 20:12:17 +02:00 · 2019-04-12 20:12:17 +02:00 · 5cb4538b58
commit 5cb4538b58
--- a/sources/constants.py
+++ b/sources/constants.py
@ -12,7 +12,7 @@ Creation date:
 Last modified date:
    12/04/2019
 Version:
-    1.1.5
+    1.2.0
 '''

 ####################################################################################################
@ -30,11 +30,13 @@ CONST = {
    'INIT_ENABLE' : True, # Initial enable/disable status at Bot start
    'INIT_CAPTCHA_TIME_MIN' : 5, # Initial captcha solve time (in minutes)
    'T_DEL_MSG' : 5, # Default time (in mins) to remove self-destruct sent messages from the Bot
+    'F_TLDS' : './tlds-alpha-by-domain.txt', # IANA TLD list (https://data.iana.org/TLD/tlds-alpha-by-domain.txt)
+    'REGEX_URLS' : r'((?<=[^a-zA-Z0-9])*(?:https\:\/\/|[a-zA-Z0-9]{{1,}}\.{{1}}|\b)(?:\w{{1,}}\.{{1}}){{1,5}}(?:{})\b/?(?!@))',
    'DEVELOPER' : '@JoseTLG', # Bot developer
    'REPOSITORY' : 'https://github.com/J-Rios/TLG_JoinCaptchaBot', # Bot code repository
    'DEV_PAYPAL' : 'https://www.paypal.me/josrios', # Developer Paypal address
    'DEV_BTC' : '3N9wf3FunR6YNXonquBeWammaBZVzTXTyR', # Developer Bitcoin address
-    'VERSION' : '1.1.5 (12/04/2019)' # Bot version
+    'VERSION' : '1.2.0 (12/04/2019)' # Bot version
 }

 TEXT = {
@ -173,6 +175,15 @@ TEXT = {
            'resolve the captcha. I try to ban the "User", but for some unexpected problem ' \
            '(maybe network/server related), I can\'t do it.',

+        'SPAM_DETECTED_RM' : \
+            'Detected a message with an URL from {}, who has not solved the captcha yet. ' \
+            'The message has been removed for the sake of a Telegram free of spam :)',
+
+        'SPAM_DETECTED_NOT_RM' : \
+            'Message with an URL detected from {}, who has not solved the captcha yet. ' \
+            'I try to remove the Spam message, but I don\'t have the administration rights for ' \
+            'remove messages that has not been sent by me.',
+
        'OTHER_CAPTCHA_BTN_TEXT' : \
            'Other Captcha',

@ -364,6 +375,15 @@ TEXT = {
            'problema inesperado (quizás relacionado con la red o el servidor), no he podido ' \
            'hacerlo.',

+        'SPAM_DETECTED_RM' : \
+            'Se ha detectado un mensaje con URL enviado por {}, quien aún no ha resuelto el ' \
+            'captcha. El mensaje ha sido eliminado en aras de un Telegram libre de Spam :)',
+
+        'SPAM_DETECTED_NOT_RM' : \
+            'Se ha detectado un mensaje con URL enviado por {}, quien aún no ha resuelto el ' \
+            'captcha. He intentado borrar el mensaje, pero no se me han dado los privilegios de ' \
+            'administración necesarios para eliminar mensajes que no son míos.',
+
        'OTHER_CAPTCHA_BTN_TEXT' : \
            'Otro Captcha',

@ -552,6 +572,15 @@ TEXT = {
            'captcha. Eu tentei banir o "usuário", mas algo deu errado (talvez algo relacionado ' \
            'à rede ou servidor). Não pude fazê-lo.',

+        'SPAM_DETECTED_RM' : \
+            'Detectou uma mensagem com um URL de {}, que ainda não resolveu o captcha. ' \
+            'A mensagem foi removida por causa de um Telegram livre de spam :)',
+
+        'SPAM_DETECTED_NOT_RM' : \
+            'Detectou uma mensagem com um URL de {}, que ainda não resolveu o captcha. ' \
+            'Eu tentei apagar essa mensagem, mas eu não tenho poderes administrativos para ' \
+            'remover mensagens que não foram enviadas por mim.',
+
        'OTHER_CAPTCHA_BTN_TEXT' : \
            'Outro Captcha',

--- a/sources/join_captcha_bot.py
+++ b/sources/join_captcha_bot.py
@ -15,12 +15,13 @@ Creation date:
 Last modified date:
    12/04/2019
 Version:
-    1.1.5
+    1.2.0
 '''

 ####################################################################################################

 ### Imported modules ###
+import re
 from sys import exit
 from signal import signal, SIGTERM, SIGINT
 from os import path, remove, makedirs, listdir
@ -95,6 +96,33 @@ def initialize_resources():
                    default_conf = get_default_config_data()
                    for key, value in default_conf.items():
                        save_config_property(f_chat_id, key, value)
+    # Load and generate URL detector regex from TLD list file
+    load_urls_regex(CONST["F_TLDS"])
+
+
+def load_urls_regex(file_path):
+    '''Load URL detection Regex from IANA TLD list text file.
+
+    Reads one TLD per line from file_path, skipping blank lines and lines that start with "#",
+    lower-cases them and fills the "{}" placeholder of CONST["REGEX_URLS"] with the resulting
+    "tld1|tld2|..." alternation. A read error is logged through printts() and not raised.'''
+    tlds = []
+    try:
+        with open(file_path, "r") as f:
+            for line in f:
+                tld = line.strip().lower()
+                # Ignore empty lines and "#" lines (first header line of IANA TLD list file)
+                if tld and not tld.startswith("#"):
+                    tlds.append(tld)
+    except Exception as e:
+        printts("Error when opening file \"{}\". {}".format(file_path, str(e)))
+    # No trailing "|": an empty alternative would make the regex match any "word.word" text
+    if tlds:
+        tlds_str = "|".join(tlds)
+    else:
+        # No TLD could be loaded: use a never-matching pattern instead of an empty group
+        tlds_str = "(?!)"
+    CONST["REGEX_URLS"] = CONST["REGEX_URLS"].format(tlds_str)


 def create_image_captcha(img_file_name):
@ -554,6 +582,7 @@ def msg_nocmd(bot, update):
    chat_type = update.message.chat.type
    user_id = update.message.from_user.id
    msg_text = update.message.text
+    msg_id = update.message.message_id
    # Verify if we are in a group
    if chat_type != "private":
        # Get and update chat data
@ -629,6 +658,24 @@ def msg_nocmd(bot, update):
                                                          TEXT[lang]["CAPTCHA_INCORRECT_1"])
                                update_to_delete_join_msg_id(chat_id, user_id, "msg_id_join2", \
                                                             sent_msg_id)
+                        else:
+                            # Check if the message contains any URL
+                            has_url = re.findall(CONST["REGEX_URLS"], msg_text)
+                            if has_url:
+                                printts("[{}] - Spammer detected: {}.".format(chat_id, \
+                                        new_user["user_name"]))
+                                printts("[{}] - Removing spam message: {}.".format(chat_id, \
+                                        msg_text))
+                                # Try to remove the message and notify detection
+                                rm_result = tlg_delete_msg(bot, chat_id, msg_id)
+                                if rm_result == 1:
+                                    bot_msg = TEXT[lang]["SPAM_DETECTED_RM"].format( \
+                                              new_user["user_name"])
+                                # Check if message cant be removed due to not delete msg privileges
+                                if rm_result == -2:
+                                    bot_msg = TEXT[lang]["SPAM_DETECTED_NOT_RM"].format( \
+                                              new_user["user_name"])
+                                tlg_send_selfdestruct_msg(bot, chat_id, bot_msg)
                    break
                i = i + 1

@ -928,9 +975,9 @@ def check_time_to_kick_not_verify_users(bot):
    while i < len(new_users_list):
        new_user = new_users_list[i]
        captcha_timeout = get_chat_config(new_user["chat_id"], "Captcha_Time")
-        # Remove from new users list, the remaining kicked users that doesnt try to join group 
-        # again in 1 hour (this allows to keep join retries in same chat, so user ban just happen 
-        # if a user try to join the group and fail to solve the captcha 3 times in the past hour)
+        # Remove from new users list the remaining kicked users that have not solved the captcha
+        # in 1 hour (a user is only banned if they try to join the group and fail to solve the
+        # captcha 3 times in the past hour)
        if time() >= (new_user["join_time"] + captcha_timeout*60) + 3600:
            # Remove user from new users list
            new_users_list.remove(new_user)
@ -940,7 +987,7 @@ def check_time_to_kick_not_verify_users(bot):
                chat_id = new_user["chat_id"]
                lang = get_chat_config(chat_id, "Language")
                printts(" ")
-                # Check if this "user" has try to join this chat 3 times and never get solve the captcha
+                # Check if this "user" has joined this chat 3 times and never solved the captcha
                if new_user["join_retries"] < 3:
                    printts("[{}] - Captcha not solved, kicking {} ({})...".format(chat_id, \
                        new_user["user_name"], new_user["user_id"]))
--- a/sources/tlds-alpha-by-domain.txt
+++ b/sources/tlds-alpha-by-domain.txt