pull/33/head
msramalho 2022-05-25 10:32:26 +02:00
parent 2a01038c0c
commit ea261635a2
4 changed files with 5 additions and 57 deletions

.gitignore (vendored): 4 changes

@@ -1,4 +1,5 @@
tmp/
temp/
.env*
.DS_Store
expmt/
@@ -10,4 +11,5 @@ anu.html
.pytest_cach
anon*
config.json
config-*.json
config-*.json
logs/*

archivers/wayback_archiver.py

@@ -1,13 +1,10 @@
import time, requests, os
import time, requests
from bs4 import BeautifulSoup
# from dataclasses import dataclass
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import WaybackConfig
# TODO: use WaybackConfig
class WaybackArchiver(Archiver):
name = "wayback"

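Note on the hunk above: the archiver now imports WaybackConfig, but the TODO comment shows the class does not consume it yet. Below is a minimal sketch of how that wiring could look, assuming WaybackConfig carries a Save Page Now key/secret pair; the field names and the stand-in class are illustrative, not taken from this repository.

from dataclasses import dataclass


@dataclass
class WaybackConfig:
    # Assumed fields; the real configs.WaybackConfig may differ.
    key: str
    secret: str


class WaybackArchiver:
    # Illustrative stand-in, not the project's archivers.WaybackArchiver.
    name = "wayback"

    def __init__(self, config: WaybackConfig):
        # One way to resolve the "# TODO: use WaybackConfig" above:
        # keep the config and derive the Save Page Now auth header from it.
        self.config = config
        self.auth_header = {
            "Accept": "application/json",
            "Authorization": f"LOW {config.key}:{config.secret}",
        }
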
auto_archive.py

@@ -133,7 +133,7 @@ def main():
mkdir_if_not_exists(c.tmp_folder)
process_sheet(c, c.sheet, header=c.header, columns=c.column_names)
shutil.rmtree(c.tmp_folder)
c.webdriver.quit()
c.destroy_webdriver()
if __name__ == '__main__':

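The main() hunk above swaps the direct c.webdriver.quit() call for c.destroy_webdriver(), moving browser teardown into the config object. Here is a minimal sketch of what such a method could look like, assuming the config object c owns the Selenium driver and reusing the Firefox setup from the deleted test.py below; the real config class is not part of this diff.

from selenium import webdriver


class Config:
    # Illustrative stand-in for the config object c used in main() above.

    def __init__(self):
        options = webdriver.FirefoxOptions()
        options.headless = True
        self.webdriver = webdriver.Firefox(options=options)
        self.webdriver.set_window_size(1400, 2000)

    def destroy_webdriver(self):
        # Quit the browser and drop the reference so a torn-down driver
        # cannot be reused by mistake.
        if self.webdriver is not None:
            self.webdriver.quit()
            self.webdriver = None
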
test.py: 51 deletions

@@ -1,51 +0,0 @@
import os
import datetime
import argparse
import requests
import shutil
import gspread
from loguru import logger
from dotenv import load_dotenv
from selenium import webdriver
import traceback
import archivers
from storages import S3Storage, S3Config
from utils import GWorksheet, mkdir_if_not_exists
load_dotenv()
options = webdriver.FirefoxOptions()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.set_window_size(1400, 2000)
s3_config = S3Config(
bucket=os.getenv('DO_BUCKET'),
region=os.getenv('DO_SPACES_REGION'),
key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET'),
folder="temp"
)
s3_client = S3Storage(s3_config)
telegram_config = archivers.TelegramConfig(
api_id=os.getenv('TELEGRAM_API_ID'),
api_hash=os.getenv('TELEGRAM_API_HASH')
)
archiver = archivers.TelethonArchiver(s3_client, driver, telegram_config)
URLs = [
# "https://t.me/c/1226032830/24864",
# "https://t.me/truexanewsua/32650",
"https://t.me/informatsia_obstanovka/5239",
# "https://t.me/informatsia_obstanovka/5240",
# "https://t.me/informatsia_obstanovka/5241",
# "https://t.me/informatsia_obstanovka/5242"
]
for url in URLs:
print(url)
print(archiver.download(url, False))
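
The deleted test.py above wires the S3 storage, Telegram config, and Firefox driver by hand and never quits the driver it launches. For reference, here is a minimal sketch of the same manual run with explicit teardown, using only names that already appear in test.py; the try/finally mirrors what the destroy_webdriver() change earlier in this commit achieves in the main script.

import os

from dotenv import load_dotenv
from selenium import webdriver

import archivers
from storages import S3Storage, S3Config

load_dotenv()

options = webdriver.FirefoxOptions()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.set_window_size(1400, 2000)

try:
    s3_client = S3Storage(S3Config(
        bucket=os.getenv('DO_BUCKET'),
        region=os.getenv('DO_SPACES_REGION'),
        key=os.getenv('DO_SPACES_KEY'),
        secret=os.getenv('DO_SPACES_SECRET'),
        folder="temp",
    ))
    telegram_config = archivers.TelegramConfig(
        api_id=os.getenv('TELEGRAM_API_ID'),
        api_hash=os.getenv('TELEGRAM_API_HASH'),
    )
    archiver = archivers.TelethonArchiver(s3_client, driver, telegram_config)
    print(archiver.download("https://t.me/informatsia_obstanovka/5239", False))
finally:
    # test.py leaves the headless Firefox process running; quitting the
    # driver here is the manual equivalent of c.destroy_webdriver().
    driver.quit()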