kopia lustrzana https://github.com/bellingcat/auto-archiver
Merge branch 'main' into feat/unittest
commit
bdfedfcf61
|
@ -1,4 +1,4 @@
|
||||||
# This workflow will upload a Python Package using Twine when a release is created
|
# This workflow uploads a Python Package to PyPI using Poetry when a release is created
|
||||||
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
|
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
|
||||||
|
|
||||||
# This workflow uses actions that are not certified by GitHub.
|
# This workflow uses actions that are not certified by GitHub.
|
||||||
|
@ -21,30 +21,34 @@ jobs:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- name: Checkout Repository
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
- name: Set up Python 3.10
|
- name: Extract Python Version from pyproject.toml
|
||||||
|
id: python-version
|
||||||
|
run: |
|
||||||
|
version=$(grep 'python =' pyproject.toml | awk -F'"' '{print $2}' | tr -d '^~<=>')
|
||||||
|
echo "python-version=$version" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v4
|
uses: actions/setup-python@v4
|
||||||
with:
|
with:
|
||||||
python-version: "3.10"
|
python-version: ${{ env.python-version }}
|
||||||
|
|
||||||
|
- name: Install Poetry
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
python -m pip install "poetry>=2.0.0,<3.0.0"
|
||||||
|
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade --upgrade-strategy=eager pip setuptools wheel twine pipenv
|
poetry install --no-root
|
||||||
python -m pip install -e . --upgrade
|
|
||||||
python -m pipenv install --dev --python 3.10
|
|
||||||
env:
|
|
||||||
PIPENV_DEFAULT_PYTHON_VERSION: "3.10"
|
|
||||||
|
|
||||||
- name: Build wheels
|
- name: Build the package
|
||||||
run: |
|
run: |
|
||||||
python -m pipenv run python setup.py sdist bdist_wheel
|
poetry build
|
||||||
|
|
||||||
- name: Publish a Python distribution to PyPI
|
# Step 6: Publish to PyPI
|
||||||
uses: pypa/gh-action-pypi-publish@release/v1
|
- name: Publish to PyPI
|
||||||
with:
|
run: |
|
||||||
user: __token__
|
poetry publish --username __token__ --password ${{ secrets.PYPI_API_TOKEN }}
|
||||||
verbose: true
|
|
||||||
skip_existing: true
|
|
||||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
|
||||||
packages_dir: dist/
|
|
||||||
|
|
|
@ -29,3 +29,4 @@ auto_archiver.egg-info*
|
||||||
logs*
|
logs*
|
||||||
*.csv
|
*.csv
|
||||||
archived/
|
archived/
|
||||||
|
dist*
|
||||||
|
|
66
Dockerfile
66
Dockerfile
|
@ -1,30 +1,58 @@
|
||||||
FROM webrecorder/browsertrix-crawler:1.0.4
|
FROM webrecorder/browsertrix-crawler:1.0.4 AS base
|
||||||
|
|
||||||
ENV RUNNING_IN_DOCKER=1
|
ENV RUNNING_IN_DOCKER=1 \
|
||||||
|
LANG=C.UTF-8 \
|
||||||
|
LC_ALL=C.UTF-8 \
|
||||||
|
PYTHONDONTWRITEBYTECODE=1 \
|
||||||
|
PYTHONFAULTHANDLER=1 \
|
||||||
|
PATH="/root/.local/bin:$PATH"
|
||||||
|
|
||||||
|
# Installing system dependencies
|
||||||
|
RUN add-apt-repository ppa:mozillateam/ppa && \
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool && \
|
||||||
|
apt-get install -y --no-install-recommends firefox-esr && \
|
||||||
|
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
||||||
|
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
|
||||||
|
tar -xvzf geckodriver* -C /usr/local/bin && \
|
||||||
|
chmod +x /usr/local/bin/geckodriver && \
|
||||||
|
rm geckodriver-v* && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
||||||
|
# Poetry and runtime
|
||||||
|
FROM base AS runtime
|
||||||
|
|
||||||
|
ENV POETRY_NO_INTERACTION=1 \
|
||||||
|
POETRY_VIRTUALENVS_IN_PROJECT=1 \
|
||||||
|
POETRY_VIRTUALENVS_CREATE=1
|
||||||
|
|
||||||
|
|
||||||
|
RUN pip install --upgrade pip && \
|
||||||
|
pip install "poetry>=2.0.0,<3.0.0"
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
RUN pip install --upgrade pip && \
|
|
||||||
pip install pipenv && \
|
COPY pyproject.toml poetry.lock README.md ./
|
||||||
add-apt-repository ppa:mozillateam/ppa && \
|
# Copy dependency files and install dependencies (excluding the package itself)
|
||||||
apt-get update && \
|
RUN poetry install --only main --no-root --no-cache
|
||||||
apt-get install -y gcc ffmpeg fonts-noto exiftool && \
|
|
||||||
apt-get install -y --no-install-recommends firefox-esr && \
|
|
||||||
ln -s /usr/bin/firefox-esr /usr/bin/firefox && \
|
|
||||||
wget https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz && \
|
|
||||||
tar -xvzf geckodriver* -C /usr/local/bin && \
|
|
||||||
chmod +x /usr/local/bin/geckodriver && \
|
|
||||||
rm geckodriver-v*
|
|
||||||
|
|
||||||
|
|
||||||
COPY Pipfile* ./
|
# Copy code: This is needed for poetry to install the package itself,
|
||||||
# install from pipenv, with browsertrix-only requirements
|
# but the environment should be cached from the previous step if toml and lock files haven't changed
|
||||||
RUN pipenv install
|
|
||||||
|
|
||||||
# doing this at the end helps during development, builds are quick
|
|
||||||
COPY ./src/ .
|
COPY ./src/ .
|
||||||
|
RUN poetry install --only main --no-cache
|
||||||
|
|
||||||
ENTRYPOINT ["pipenv", "run", "python3", "-m", "auto_archiver"]
|
|
||||||
|
# Update PATH to include virtual environment binaries
|
||||||
|
# Allowing entry point to run the application directly with Python
|
||||||
|
ENV VIRTUAL_ENV=/app/.venv \
|
||||||
|
PATH="/app/.venv/bin:$PATH"
|
||||||
|
|
||||||
|
ENTRYPOINT ["python3", "-m", "auto_archiver"]
|
||||||
|
|
||||||
# should be executed with 2 volumes (3 if local_storage is used)
|
# should be executed with 2 volumes (3 if local_storage is used)
|
||||||
# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml
|
# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa --config secrets/orchestration.yaml
|
||||||
|
|
||||||
|
|
51
Pipfile
51
Pipfile
|
@ -1,51 +0,0 @@
|
||||||
[[source]]
|
|
||||||
url = "https://pypi.org/simple"
|
|
||||||
verify_ssl = true
|
|
||||||
name = "pypi"
|
|
||||||
|
|
||||||
[packages]
|
|
||||||
gspread = "*"
|
|
||||||
boto3 = "*"
|
|
||||||
argparse = "*"
|
|
||||||
beautifulsoup4 = "*"
|
|
||||||
tiktok-downloader = "*"
|
|
||||||
bs4 = "*"
|
|
||||||
loguru = "*"
|
|
||||||
ffmpeg-python = "*"
|
|
||||||
selenium = "*"
|
|
||||||
snscrape = "*"
|
|
||||||
telethon = "*"
|
|
||||||
google-api-python-client = "*"
|
|
||||||
google-auth-httplib2 = "*"
|
|
||||||
google-auth-oauthlib = "*"
|
|
||||||
oauth2client = "*"
|
|
||||||
pdqhash = "*"
|
|
||||||
pillow = "*"
|
|
||||||
python-slugify = "*"
|
|
||||||
pyyaml = "*"
|
|
||||||
dateparser = "*"
|
|
||||||
python-twitter-v2 = "*"
|
|
||||||
instaloader = "*"
|
|
||||||
tqdm = "*"
|
|
||||||
jinja2 = "*"
|
|
||||||
cryptography = "*"
|
|
||||||
dataclasses-json = "*"
|
|
||||||
vk-url-scraper = "*"
|
|
||||||
requests = {extras = ["socks"], version = "*"}
|
|
||||||
warcio = "*"
|
|
||||||
jsonlines = "*"
|
|
||||||
pysubs2 = "*"
|
|
||||||
minify-html = "*"
|
|
||||||
retrying = "*"
|
|
||||||
tsp-client = "*"
|
|
||||||
certvalidator = "*"
|
|
||||||
numpy = "==2.1.3"
|
|
||||||
pyopenssl = "==24.2.1"
|
|
||||||
yt-dlp = "==2024.09.27"
|
|
||||||
|
|
||||||
[dev-packages]
|
|
||||||
autopep8 = "*"
|
|
||||||
setuptools-pipfile = "*"
|
|
||||||
|
|
||||||
[requires]
|
|
||||||
python_version = "3.10"
|
|
Plik diff jest za duży
Load Diff
11
README.md
11
README.md
|
@ -50,7 +50,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
|
||||||
<details><summary><code>Python package instructions</code></summary>
|
<details><summary><code>Python package instructions</code></summary>
|
||||||
|
|
||||||
1. make sure you have python 3.10 or higher installed
|
1. make sure you have python 3.10 or higher installed
|
||||||
2. install the package `pip/pipenv/conda install auto-archiver`
|
2. install the package with your preferred package manager: `pip/pipenv/conda install auto-archiver` or `poetry add auto-archiver`
|
||||||
3. test it's installed with `auto-archiver --help`
|
3. test it's installed with `auto-archiver --help`
|
||||||
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/` folder, which we advise
|
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/` folder, which we advise
|
||||||
|
|
||||||
|
@ -68,12 +68,13 @@ This can also be used for development.
|
||||||
Install the following locally:
|
Install the following locally:
|
||||||
1. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
|
1. [ffmpeg](https://www.ffmpeg.org/) must also be installed locally for this tool to work.
|
||||||
2. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
|
2. [firefox](https://www.mozilla.org/en-US/firefox/new/) and [geckodriver](https://github.com/mozilla/geckodriver/releases) on a path folder like `/usr/local/bin`.
|
||||||
3. (optional) [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
|
3. [Poetry](https://python-poetry.org/docs/#installation) for dependency management and packaging.
|
||||||
|
4. (optional) [fonts-noto](https://fonts.google.com/noto) to deal with multiple unicode characters during selenium/geckodriver's screenshots: `sudo apt install fonts-noto -y`.
|
||||||
|
|
||||||
Clone and run:
|
Clone and run:
|
||||||
1. `git clone https://github.com/bellingcat/auto-archiver`
|
1. `git clone https://github.com/bellingcat/auto-archiver`
|
||||||
2. `pipenv install`
|
2. `poetry install`
|
||||||
3. `pipenv run python -m src.auto_archiver --config secrets/orchestration.yaml`
|
3. `poetry run python -m src.auto_archiver --config secrets/orchestration.yaml`
|
||||||
|
|
||||||
|
|
||||||
</details><br/>
|
</details><br/>
|
||||||
|
@ -119,7 +120,7 @@ auto-archiver --config secrets/orchestration.yaml --cli_feeder.urls="url1,url2,u
|
||||||
```
|
```
|
||||||
|
|
||||||
Here's the complete workflow that the auto-archiver goes through:
|
Here's the complete workflow that the auto-archiver goes through:
|
||||||
```mermaid
|
```{mermaid}
|
||||||
graph TD
|
graph TD
|
||||||
s((start)) --> F(fa:fa-table Feeder)
|
s((start)) --> F(fa:fa-table Feeder)
|
||||||
F -->|get and clean URL| D1{fa:fa-database Database}
|
F -->|get and clean URL| D1{fa:fa-database Database}
|
||||||
|
|
Plik diff jest za duży
Load Diff
|
@ -1,4 +1,76 @@
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["setuptools", "wheel", "setuptools-pipfile"]
|
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "poetry.core.masonry.api"
|
||||||
[tool.setuptools-pipfile]
|
|
||||||
|
[project]
|
||||||
|
name = "auto-archiver"
|
||||||
|
version = "0.13.0"
|
||||||
|
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||||
|
|
||||||
|
requires-python = ">=3.10,<3.13"
|
||||||
|
license = "MIT"
|
||||||
|
authors = [
|
||||||
|
{ name = "Bellingcat", email = "tech@bellingcat.com" },
|
||||||
|
]
|
||||||
|
readme = "README.md"
|
||||||
|
keywords = ["archive", "oosi", "osint", "scraping"]
|
||||||
|
classifiers = [
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"Intended Audience :: Science/Research",
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Programming Language :: Python :: 3"
|
||||||
|
]
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
"gspread (>=0.0.0)",
|
||||||
|
"argparse (>=0.0.0)",
|
||||||
|
"beautifulsoup4 (>=0.0.0)",
|
||||||
|
"tiktok-downloader (>=0.0.0)",
|
||||||
|
"bs4 (>=0.0.0)",
|
||||||
|
"loguru (>=0.0.0)",
|
||||||
|
"ffmpeg-python (>=0.0.0)",
|
||||||
|
"selenium (>=0.0.0)",
|
||||||
|
"telethon (>=0.0.0)",
|
||||||
|
"google-api-python-client (>=0.0.0)",
|
||||||
|
"google-auth-httplib2 (>=0.0.0)",
|
||||||
|
"google-auth-oauthlib (>=0.0.0)",
|
||||||
|
"oauth2client (>=0.0.0)",
|
||||||
|
"pdqhash (>=0.0.0)",
|
||||||
|
"pillow (>=0.0.0)",
|
||||||
|
"python-slugify (>=0.0.0)",
|
||||||
|
"pyyaml (>=0.0.0)",
|
||||||
|
"dateparser (>=0.0.0)",
|
||||||
|
"python-twitter-v2 (>=0.0.0)",
|
||||||
|
"instaloader (>=0.0.0)",
|
||||||
|
"tqdm (>=0.0.0)",
|
||||||
|
"jinja2 (>=0.0.0)",
|
||||||
|
"pyOpenSSL (==24.2.1)",
|
||||||
|
"cryptography (>=41.0.0,<42.0.0)",
|
||||||
|
"boto3 (>=1.28.0,<2.0.0)",
|
||||||
|
"dataclasses-json (>=0.0.0)",
|
||||||
|
"yt-dlp (==2024.09.27)",
|
||||||
|
"numpy (==2.1.3)",
|
||||||
|
"vk-url-scraper (>=0.0.0)",
|
||||||
|
"requests[socks] (>=0.0.0)",
|
||||||
|
"warcio (>=0.0.0)",
|
||||||
|
"jsonlines (>=0.0.0)",
|
||||||
|
"pysubs2 (>=0.0.0)",
|
||||||
|
"minify-html (>=0.0.0)",
|
||||||
|
"retrying (>=0.0.0)",
|
||||||
|
"tsp-client (>=0.0.0)",
|
||||||
|
"certvalidator (>=0.0.0)",
|
||||||
|
"toml (>=0.10.2,<0.11.0)"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[poetry.group.dev.dependencies]
|
||||||
|
autopep8 = "*"
|
||||||
|
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
auto-archiver = "auto_archiver.__main__:main"
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
homepage = "https://github.com/bellingcat/auto-archiver"
|
||||||
|
repository = "https://github.com/bellingcat/auto-archiver"
|
||||||
|
documentation = "https://github.com/bellingcat/auto-archiver"
|
53
setup.cfg
53
setup.cfg
|
@ -1,53 +0,0 @@
|
||||||
[metadata]
|
|
||||||
name = auto_archiver
|
|
||||||
version = attr: auto_archiver.version.__version__
|
|
||||||
author = Bellingcat
|
|
||||||
author_email = tech@bellingcat.com
|
|
||||||
description = Easily archive online media content
|
|
||||||
long_description = file: README.md
|
|
||||||
long_description_content_type = text/markdown
|
|
||||||
keywords = archive, oosi, osint, scraping
|
|
||||||
license = MIT
|
|
||||||
classifiers =
|
|
||||||
Intended Audience :: Developers
|
|
||||||
Intended Audience :: Science/Research
|
|
||||||
License :: OSI Approved :: MIT License
|
|
||||||
Programming Language :: Python :: 3
|
|
||||||
project_urls =
|
|
||||||
Source Code = https://github.com/bellingcat/auto-archiver
|
|
||||||
Bug Tracker = https://github.com/bellingcat/auto-archiver/issues
|
|
||||||
Bellingcat = https://www.bellingcat.com
|
|
||||||
platforms = any
|
|
||||||
|
|
||||||
[options]
|
|
||||||
setup_requires =
|
|
||||||
setuptools-pipfile
|
|
||||||
zip_safe = False
|
|
||||||
package_dir=
|
|
||||||
=src
|
|
||||||
packages=find:
|
|
||||||
find_packages=true
|
|
||||||
python_requires = >=3.10
|
|
||||||
|
|
||||||
[options.package_data]
|
|
||||||
* = *.html
|
|
||||||
|
|
||||||
[options.entry_points]
|
|
||||||
console_scripts =
|
|
||||||
auto-archiver = auto_archiver.__main__:main
|
|
||||||
|
|
||||||
# [options.extras_require]
|
|
||||||
# pdf = ReportLab>=1.2; RXP
|
|
||||||
# rest = docutils>=0.3; pack ==1.1, ==1.3
|
|
||||||
|
|
||||||
[options.packages.find]
|
|
||||||
where=src
|
|
||||||
# include=auto_archiver*
|
|
||||||
# exclude =
|
|
||||||
# examples*
|
|
||||||
# .eggs*
|
|
||||||
# build*
|
|
||||||
# secrets*
|
|
||||||
# tmp*
|
|
||||||
# docs*
|
|
||||||
# src.tests*
|
|
4
setup.py
4
setup.py
|
@ -1,4 +0,0 @@
|
||||||
from setuptools import setup
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
setup()
|
|
|
@ -162,8 +162,7 @@ class TwitterArchiver(Archiver):
|
||||||
.set_timestamp(timestamp)
|
.set_timestamp(timestamp)
|
||||||
if not tweet.get("entities", {}).get("media"):
|
if not tweet.get("entities", {}).get("media"):
|
||||||
logger.debug('No media found, archiving tweet text only')
|
logger.debug('No media found, archiving tweet text only')
|
||||||
result.status = "twitter-ytdl"
|
return result.success("twitter-ytdl")
|
||||||
return result
|
|
||||||
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
||||||
media = Media(filename="")
|
media = Media(filename="")
|
||||||
mimetype = ""
|
mimetype = ""
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
|
""" Version information for the auto_archiver package.
|
||||||
|
TODO: This is a placeholder to replicate previous versioning.
|
||||||
|
|
||||||
|
"""
|
||||||
|
from importlib.metadata import version as get_version
|
||||||
|
|
||||||
|
VERSION_SHORT = get_version("auto_archiver")
|
||||||
|
|
||||||
_MAJOR = "0"
|
|
||||||
_MINOR = "13"
|
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
|
||||||
# released build.
|
|
||||||
_PATCH = "1"
|
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
__version__ = f"{VERSION_SHORT}{_SUFFIX}"
|
||||||
VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
|
|
||||||
__version__ = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
|
|
Ładowanie…
Reference in New Issue