Merge branch 'main' into feat/unittest

pull/163/head
Patrick Robertson 2025-01-13 19:50:47 +01:00
commit bdfedfcf61
12 zmienionych plików z 3338 dodań i 2350 usunięć

Wyświetl plik

@ -1,4 +1,4 @@
# This workflow will upload a Python Package using Twine when a release is created
# This workflow uploads a Python Package to PyPI using Poetry when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
# This workflow uses actions that are not certified by GitHub.
@ -21,30 +21,34 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Checkout Repository
uses: actions/checkout@v3
- name: Set up Python 3.10
- name: Extract Python Version from pyproject.toml
  id: python-version
  run: |
    # The version constraint in pyproject.toml is a range such as ">=3.10,<3.13".
    # Stripping only the operators would leave "3.10,3.13", which is not a valid
    # single version, so keep just the first (minimum) version number.
    version=$(grep -m1 -E '^(requires-)?python[[:space:]]*=' pyproject.toml | grep -oE '[0-9]+(\.[0-9]+)+' | head -n1)
    # Exported through GITHUB_ENV so later steps can read it as env.python-version
    echo "python-version=$version" >> "$GITHUB_ENV"
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: ${{ env.python-version }}
- name: Install Poetry
run: |
python -m pip install --upgrade pip
python -m pip install "poetry>=2.0.0,<3.0.0"
- name: Install dependencies
run: |
python -m pip install --upgrade --upgrade-strategy=eager pip setuptools wheel twine pipenv
python -m pip install -e . --upgrade
python -m pipenv install --dev --python 3.10
env:
PIPENV_DEFAULT_PYTHON_VERSION: "3.10"
poetry install --no-root
- name: Build wheels
- name: Build the package
run: |
python -m pipenv run python setup.py sdist bdist_wheel
poetry build
- name: Publish a Python distribution to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
verbose: true
skip_existing: true
password: ${{ secrets.PYPI_API_TOKEN }}
packages_dir: dist/
# Publish the distributions built by `poetry build` to PyPI.
# The API token is handed to Poetry through its POETRY_PYPI_TOKEN_PYPI environment
# variable instead of being interpolated into the shell command line.
- name: Publish to PyPI
  env:
    POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_API_TOKEN }}
  run: |
    poetry publish

1
.gitignore vendored
Wyświetl plik

@ -29,3 +29,4 @@ auto_archiver.egg-info*
logs*
*.csv
archived/
dist*

Wyświetl plik

@ -1,30 +1,58 @@
FROM webrecorder/browsertrix-crawler:1.0.4
FROM webrecorder/browsertrix-crawler:1.0.4 AS base
ENV RUNNING_IN_DOCKER=1
ENV RUNNING_IN_DOCKER=1 \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8 \
PYTHONDONTWRITEBYTECODE=1 \
PYTHONFAULTHANDLER=1 \
PATH="/root/.local/bin:$PATH"
# Installing system dependencies
RUN add-apt-repository ppa:mozillateam/ppa && \
apt-get update && \
apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool && \
apt-get install -y --no-install-recommends firefox-esr && \
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
tar -xvzf geckodriver* -C /usr/local/bin && \
chmod +x /usr/local/bin/geckodriver && \
rm geckodriver-v* && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Poetry and runtime
FROM base AS runtime
ENV POETRY_NO_INTERACTION=1 \
POETRY_VIRTUALENVS_IN_PROJECT=1 \
POETRY_VIRTUALENVS_CREATE=1
# Install Poetry (2.x) with pip; --no-cache-dir keeps pip's download cache
# out of the image layer.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir "poetry>=2.0.0,<3.0.0"
WORKDIR /app
RUN pip install --upgrade pip && \
pip install pipenv && \
add-apt-repository ppa:mozillateam/ppa && \
apt-get update && \
apt-get install -y gcc ffmpeg fonts-noto exiftool && \
apt-get install -y --no-install-recommends firefox-esr && \
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
tar -xvzf geckodriver* -C /usr/local/bin && \
chmod +x /usr/local/bin/geckodriver && \
rm geckodriver-v*
COPY pyproject.toml poetry.lock README.md ./
# Copy dependency files and install dependencies (excluding the package itself)
RUN poetry install --only main --no-root --no-cache
COPY Pipfile* ./
# install from pipenv, with browsertrix-only requirements
RUN pipenv install
# Copy code: This is needed for poetry to install the package itself,
# but the environment should be cached from the previous step if toml and lock files haven't changed
COPY ./src/ .
RUN poetry install --only main --no-cache
# doing this at the end helps during development, builds are quick
COPY ./src/ .
ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
# Update PATH to include virtual environment binaries
# Allowing entry point to run the application directly with Python
ENV VIRTUAL_ENV=/app/.venv \
PATH="/app/.venv/bin:$PATH"
ENTRYPOINT ["python3", "-m", "auto_archiver"]
# should be executed with 2 volumes (3 if local_storage is used)
# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --config secrets/orchestration.yaml

51
Pipfile
Wyświetl plik

@ -1,51 +0,0 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
gspread = "*"
boto3 = "*"
argparse = "*"
beautifulsoup4 = "*"
tiktok-downloader = "*"
bs4 = "*"
loguru = "*"
ffmpeg-python = "*"
selenium = "*"
snscrape = "*"
telethon = "*"
google-api-python-client = "*"
google-auth-httplib2 = "*"
google-auth-oauthlib = "*"
oauth2client = "*"
pdqhash = "*"
pillow = "*"
python-slugify = "*"
pyyaml = "*"
dateparser = "*"
python-twitter-v2 = "*"
instaloader = "*"
tqdm = "*"
jinja2 = "*"
cryptography = "*"
dataclasses-json = "*"
vk-url-scraper = "*"
requests = {extras = ["socks"], version = "*"}
warcio = "*"
jsonlines = "*"
pysubs2 = "*"
minify-html = "*"
retrying = "*"
tsp-client = "*"
certvalidator = "*"
numpy = "==2.1.3"
pyopenssl = "==24.2.1"
yt-dlp = "==2024.09.27"
[dev-packages]
autopep8 = "*"
setuptools-pipfile = "*"
[requires]
python_version = "3.10"

2186
Pipfile.lock wygenerowano

Plik diff jest za duży Load Diff

Wyświetl plik

@ -50,7 +50,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
<details><summary><code>Python package instructions</code></summary>
1. make sure you have python 3.10 or higher installed
2. install the package `pip/pipenv/conda install auto-archiver`
2. install the package with your preferred package manager: `pip/pipenv/conda install auto-archiver` or `poetry add auto-archiver`
3. test it's installed with `auto-archiver --help`
4. run it with your orchestration file, passing any flags you want on the command line: `auto-archiver --config secrets/orchestration.yaml` (this assumes your orchestration file is inside a `secrets/` folder, which we advise)
@ -68,12 +68,13 @@ This can also be used for development.
Install the following locally:
1. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
2. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
3. (optional) [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
3. [Poetry](https://python-poetry.org/docs/#installation) for dependency management and packaging.
4. (optional) [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
Clone and run:
1. `git clone https://github.com/bellingcat/auto-archiver`
2. `pipenv install`
3. `pipenv run python -m src.auto_archiver --config secrets/orchestration.yaml`
2. `poetry install`
3. `poetry run python -m src.auto_archiver --config secrets/orchestration.yaml`
</details><br/>
@ -119,7 +120,7 @@ auto-archiver --config secrets/orchestration.yaml --cli_feeder.urls="url1,url2,u
```
Here's the complete workflow that the auto-archiver goes through:
```mermaid
```{mermaid}
graph TD
s((start)) --> F(fa:fa-table Feeder)
F -->|get and clean URL| D1{fa:fa-database Database}

3177
poetry.lock wygenerowano 100644

Plik diff jest za duży Load Diff

Wyświetl plik

@ -1,4 +1,76 @@
[build-system]
requires = ["setuptools", "wheel", "setuptools-pipfile"]
build-backend = "setuptools.build_meta"
[tool.setuptools-pipfile]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"
[project]
name = "auto-archiver"
version = "0.13.0"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13"
license = "MIT"
authors = [
{ name = "Bellingcat", email = "tech@bellingcat.com" },
]
readme = "README.md"
keywords = ["archive", "oosi", "osint", "scraping"]
classifiers = [
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3"
]
dependencies = [
"gspread (>=0.0.0)",
"argparse (>=0.0.0)",
"beautifulsoup4 (>=0.0.0)",
"tiktok-downloader (>=0.0.0)",
"bs4 (>=0.0.0)",
"loguru (>=0.0.0)",
"ffmpeg-python (>=0.0.0)",
"selenium (>=0.0.0)",
"telethon (>=0.0.0)",
"google-api-python-client (>=0.0.0)",
"google-auth-httplib2 (>=0.0.0)",
"google-auth-oauthlib (>=0.0.0)",
"oauth2client (>=0.0.0)",
"pdqhash (>=0.0.0)",
"pillow (>=0.0.0)",
"python-slugify (>=0.0.0)",
"pyyaml (>=0.0.0)",
"dateparser (>=0.0.0)",
"python-twitter-v2 (>=0.0.0)",
"instaloader (>=0.0.0)",
"tqdm (>=0.0.0)",
"jinja2 (>=0.0.0)",
"pyOpenSSL (==24.2.1)",
"cryptography (>=41.0.0,<42.0.0)",
"boto3 (>=1.28.0,<2.0.0)",
"dataclasses-json (>=0.0.0)",
"yt-dlp (==2024.09.27)",
"numpy (==2.1.3)",
"vk-url-scraper (>=0.0.0)",
"requests[socks] (>=0.0.0)",
"warcio (>=0.0.0)",
"jsonlines (>=0.0.0)",
"pysubs2 (>=0.0.0)",
"minify-html (>=0.0.0)",
"retrying (>=0.0.0)",
"tsp-client (>=0.0.0)",
"certvalidator (>=0.0.0)",
"toml (>=0.10.2,<0.11.0)"
]
# Development-only dependencies. Poetry reads dependency groups from the
# [tool.poetry.group.<name>.dependencies] table.
[tool.poetry.group.dev.dependencies]
autopep8 = "*"
[project.scripts]
auto-archiver = "auto_archiver.__main__:main"
[project.urls]
homepage = "https://github.com/bellingcat/auto-archiver"
repository = "https://github.com/bellingcat/auto-archiver"
documentation = "https://github.com/bellingcat/auto-archiver"

Wyświetl plik

@ -1,53 +0,0 @@
[metadata]
name = auto_archiver
version = attr: auto_archiver.version.__version__
author = Bellingcat
author_email = tech@bellingcat.com
description = Easily archive online media content
long_description = file: README.md
long_description_content_type = text/markdown
keywords = archive, oosi, osint, scraping
license = MIT
classifiers =
Intended Audience :: Developers
Intended Audience :: Science/Research
License :: OSI Approved :: MIT License
Programming Language :: Python :: 3
project_urls =
Source Code = https://github.com/bellingcat/auto-archiver
Bug Tracker = https://github.com/bellingcat/auto-archiver/issues
Bellingcat = https://www.bellingcat.com
platforms = any
[options]
setup_requires =
setuptools-pipfile
zip_safe = False
package_dir=
=src
packages=find:
find_packages=true
python_requires = >=3.10
[options.package_data]
* = *.html
[options.entry_points]
console_scripts =
auto-archiver = auto_archiver.__main__:main
# [options.extras_require]
# pdf = ReportLab>=1.2; RXP
# rest = docutils>=0.3; pack ==1.1, ==1.3
[options.packages.find]
where=src
# include=auto_archiver*
# exclude =
# examples*
# .eggs*
# build*
# secrets*
# tmp*
# docs*
# src.tests*

Wyświetl plik

@ -1,4 +0,0 @@
from setuptools import setup
if __name__ == "__main__":
setup()

Wyświetl plik

@ -162,8 +162,7 @@ class TwitterArchiver(Archiver):
.set_timestamp(timestamp)
if not tweet.get("entities", {}).get("media"):
logger.debug('No media found, archiving tweet text only')
result.status = "twitter-ytdl"
return result
return result.success("twitter-ytdl")
for i, tw_media in enumerate(tweet["entities"]["media"]):
media = Media(filename="")
mimetype = ""

Wyświetl plik

@ -1,12 +1,12 @@
""" Version information for the auto_archiver package.
TODO: This is a placeholder to replicate previous versioning.
"""
from importlib.metadata import version as get_version
VERSION_SHORT = get_version("auto_archiver")
_MAJOR = "0"
_MINOR = "13"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "1"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
__version__ = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
__version__ = f"{VERSION_SHORT}{_SUFFIX}"