Mirror of https://github.com/bellingcat/auto-archiver
Update auto archiver docs with new header declaration method
parent 866c4fa7fd
commit 339f62fade
Pipfile: 1 change
@@ -10,6 +10,7 @@ python-dotenv = "*"
 youtube_dl = "*"
 argparse = "*"
 ffmpeg-python = "*"
+beautifulsoup4 = "*"

 [dev-packages]
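The only manifest change is the new `beautifulsoup4` dependency (its `soupsieve` requirement shows up in the lock file below). As a rough, hypothetical illustration of what that library does, not code taken from this repository:

```python
# Hypothetical beautifulsoup4 usage: parse an HTML page and collect link targets.
# Nothing here comes from auto-archiver's own code.
from bs4 import BeautifulSoup

html = "<html><body><a href='https://example.com/clip.mp4'>clip</a></body></html>"
soup = BeautifulSoup(html, "html.parser")
print([a["href"] for a in soup.find_all("a")])  # ['https://example.com/clip.mp4']
```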
Pipfile.lock

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "a5308ff7514ddcff08e39ae06fe8f96e63ea3eecfbf4c106c907c8aa6b76b8dd"
+            "sha256": "117af2367c550817e099dd85a55aad372e90f7e8f8940fdc69a8f8d052c3ea7b"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -24,29 +24,38 @@
             "index": "pypi",
             "version": "==1.4.0"
         },
-        "boto3": {
+        "beautifulsoup4": {
             "hashes": [
-                "sha256:d39c04b51e60197f5503f8489f043bc904981567cc8431d389367767dc3fd5ae",
-                "sha256:fe1898c5b10035528207995c9931b78f2f50bb70cf93bac353152aea47c04780"
+                "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
+                "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
+                "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
             ],
             "index": "pypi",
-            "version": "==1.17.30"
+            "version": "==4.9.3"
+        },
+        "boto3": {
+            "hashes": [
+                "sha256:d856a71d74351649ca8dd59ad17c8c3e79ea57734ff4a38a97611e1e10b06863",
+                "sha256:da1b2c884dbf56cc3ece07940a7b654f41a93b9fc40ee1ed21a76da25a05989c"
+            ],
+            "index": "pypi",
+            "version": "==1.17.62"
         },
         "botocore": {
             "hashes": [
-                "sha256:63951595a736dfc9759f57e33bec6eaea4f09c4800626ef5309437060b263e48",
-                "sha256:98ff1eb210d394a1ffe736b33c8a7be68f30f0a03550b559c5bb6fdf0c29328d"
+                "sha256:e4f8cb923edf035c2ae5f6169c70e77e31df70b88919b92b826a6b9bd14511b1",
+                "sha256:f7c2c5c5ed5212b2628d8fb1c587b31c6e8d413ecbbd1a1cdf6f96ed6f5c8d5e"
             ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
-            "version": "==1.20.30"
+            "version": "==1.20.62"
         },
         "cachetools": {
             "hashes": [
-                "sha256:1d9d5f567be80f7c07d765e21b814326d78c61eb0c3a637dffc0e5d1796cb2e2",
-                "sha256:f469e29e7aa4cff64d8de4aad95ce76de8ea1125a16c68e0d93f65c3c3dc92e9"
+                "sha256:2cc0b89715337ab6dbba85b5b50effe2b0c74e035d83ee8ed637cf52f12ae001",
+                "sha256:61b5ed1e22a0924aed1d23b478f37e8d52549ff8a961de2909c69bf950020cff"
             ],
             "markers": "python_version ~= '3.5'",
-            "version": "==4.2.1"
+            "version": "==4.2.2"
         },
         "certifi": {
             "hashes": [
@@ -80,19 +89,19 @@
         },
         "google-auth": {
             "hashes": [
-                "sha256:9bd436d19ab047001a1340720d2b629eb96dd503258c524921ec2af3ee88a80e",
-                "sha256:dcaba3aa9d4e0e96fd945bf25a86b6f878fcb05770b67adbeb50a63ca4d28a5e"
+                "sha256:588bdb03a41ecb4978472b847881e5518b5d9ec6153d3d679aa127a55e13b39f",
+                "sha256:9ad25fba07f46a628ad4d0ca09f38dcb262830df2ac95b217f9b0129c9e42206"
             ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
-            "version": "==1.28.0"
+            "version": "==1.30.0"
         },
         "google-auth-oauthlib": {
             "hashes": [
-                "sha256:54431535309cfab50897d9c181e8c2226268825aa6e42e930b05b99c5041a18c",
-                "sha256:dabffbf594a6be2fd6d054060846d1201569252efb10dfb749b504a7591f8af0"
+                "sha256:09832c6e75032f93818edf1affe4746121d640c625a5bef9b5c96af676e98eee",
+                "sha256:0e92aacacfb94978de3b7972cf4b0f204c3cd206f74ddd0dc0b31e91164e6317"
             ],
             "markers": "python_version >= '3.6'",
-            "version": "==0.4.3"
+            "version": "==0.4.4"
         },
         "gspread": {
             "hashes": [
@@ -172,11 +181,11 @@
         },
         "python-dotenv": {
             "hashes": [
-                "sha256:0c8d1b80d1a1e91717ea7d526178e3882732420b03f08afea0406db6402e220e",
-                "sha256:587825ed60b1711daea4832cf37524dfd404325b7db5e25ebe88c495c9f807a0"
+                "sha256:00aa34e92d992e9f8383730816359647f358f4a3be1ba45e5a5cefd27ee91544",
+                "sha256:b1ae5e9643d5ed987fc57cc2583021e38db531946518130777734f9589b3141f"
             ],
             "index": "pypi",
-            "version": "==0.15.0"
+            "version": "==0.17.1"
         },
         "requests": {
             "hashes": [
@@ -204,10 +213,10 @@
         },
         "s3transfer": {
             "hashes": [
-                "sha256:1e28620e5b444652ed752cf87c7e0cb15b0e578972568c6609f0f18212f259ed",
-                "sha256:7fdddb4f22275cf1d32129e21f056337fd2a80b6ccef1664528145b72c49e6d2"
+                "sha256:9b3752887a2880690ce628bc263d6d13a3864083aeacff4890c1c9839a5eb0bc",
+                "sha256:cb022f4b16551edebbb31a377d3f09600dbada7363d8c5db7976e7f47732e1b2"
             ],
-            "version": "==0.3.4"
+            "version": "==0.4.2"
         },
         "six": {
             "hashes": [
@@ -217,6 +226,14 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==1.15.0"
         },
+        "soupsieve": {
+            "hashes": [
+                "sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc",
+                "sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b"
+            ],
+            "markers": "python_version >= '3.0'",
+            "version": "==2.2.1"
+        },
         "urllib3": {
             "hashes": [
                 "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df",
@@ -227,11 +244,11 @@
         },
         "youtube-dl": {
             "hashes": [
-                "sha256:c287ad8dd33471aabaabab5ab1dd825bebc70eb8b83ebfa93fd71022e01a1d08",
-                "sha256:d414166efe52447877db06803816277f52f405faeee2bdf5ef816b30e352b3b8"
+                "sha256:37972e16bb195a2cde7c0eebde1c650f3c17d3bc8020e49c512db79d6cfc31ae",
+                "sha256:6f311ffaf8b88cdcf27a2301a2272455e213bdb780aa447246933a3da4532879"
             ],
             "index": "pypi",
-            "version": "==2021.3.14"
+            "version": "==2021.4.26"
         }
     },
     "develop": {}
README.md: 41 changes
@@ -19,36 +19,39 @@ DO_SPACES_SECRET=

 ## Running

-There are several necessary command line flags:
-* `--sheet name` sets the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`.
-* `--url-col`, `--status-col`, `--archive-col`, and `--date-col` give the letter name of the column for the original source URL, the archiver status, the archive location, and the archive date respectively.
+There is just one necessary command line flag, `--sheet name`, which sets the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`. This sheet must also have specific columns in the first row:
+* `Media URL` (required): the location of the media to be archived. This is the only column that should be supplied with data initially
+* `Archive status` (required): the status of the auto archiver script. Any row with text in this column will be skipped automatically.
+* `Archive location` (required): the location of the archived version. For files that were not able to be auto archived, this can be manually updated.
+* `Archive date`: the date that the auto archiver script ran for this file
+* `Upload timestamp`: the timestamp extracted from the video. (For YouTube, this unfortunately does not currently include the time)
+* `Duration`: the duration of the video
+* `Upload title`: the "title" of the video from the original source
+* `Thumbnail`: an image thumbnail of the video (resize row height to make this more visible)
+* `Thumbnail index`: a link to a page that shows many thumbnails for the video, useful for quickly seeing video content

 For example, for use with this spreadsheet:

-
+

-```pipenv run python auto-archive.py --sheet archiver-test --url-col A --status-col B --archive-col C --date-col D ```
+```pipenv run python auto-archive.py --sheet archiver-test```

-The Youtube link is downloaded and archived, and the spreadsheet is updated to the following:
+When the auto archiver starts running, it updates the "Archive status" column.

-
+

-By default, the archiver will skip over live streaming content. However, with the `--streaming` flag, it will skip over non-real time content and archive livestreams. This is blocking, and each execution of the archiver will start downloading only a single livestreamed video. This is so that it can be used in combination with a non-streaming archiver, as detailed below.
+The links are downloaded and archived, and the spreadsheet is updated to the following:

-Note that the first row is skipped, as it is assumed to be a header row. Rows with an empty URL column, or a non-empty archive column are also skipped.
+

-Finally, by default only the first worksheet in a Google Sheet is checked. To check all use the `--all-worksheets` flag. These worksheets must use the same column locations.
+Live streaming content is recorded in a separate thread.
+
+Note that the first row is skipped, as it is assumed to be a header row. Rows with an empty URL column, or a non-empty archive column are also skipped. All sheets in the document will be checked.

 ## Automating

-The auto-archiver can be run automatically via cron. To prevent overlapping execution (which should not affect archive integrity, but will use unnecessary compute and network resources) `flock` can be used to create a lockfile. An example crontab entry that runs the archiver every minute is as follows.
+The auto-archiver can be run automatically via cron. An example crontab entry that runs the archiver every minute is as follows.

-```* * * * * flock -w 0 archive.lock python auto-archive.py --sheet archiver-test --url-col A --status-col B --archive-col C --date-col D```
+```* * * * * python auto-archive.py --sheet archiver-test```

-Of course, additional logging information, etc. might be required.
-
-With streaming mode enabled, the archiver can run safely at any frequency (since each iteration affects only a single row in the spreadsheet and it marks when streaming has started.) An example crontab line to run it every minute is as follows:
-
-```* * * * * python auto-archive.py --sheet archiver-test --url-col A --status-col B --archive-col C --date-col D --streaming```
-
-When these two cronjobs are used together, the archiver should archive and store all media added to the Google Sheet every 60 seconds.
+With this configuration, the archiver should archive and store all media added to the Google Sheet every 60 seconds. Of course, additional logging information, etc. might be required.
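The new instructions replace the per-column CLI flags with named headers in the sheet's first row. A minimal sketch of how such a header declaration could be resolved with `gspread` follows; it is an illustration only, not the project's actual implementation, and the helper name, sheet name, and error handling are assumptions. The `archiver-test` sheet and `service_account.json` path mirror the README example above.

```python
# Minimal sketch: resolve the columns declared in the sheet's header row.
# Header names ("Media URL", "Archive status") come from the README above;
# everything else is assumed for illustration.
import gspread

def column_index(headers, name):
    # gspread addresses columns 1-based; headers live in row 1.
    return headers.index(name) + 1

gc = gspread.service_account(filename='service_account.json')
wks = gc.open("archiver-test").sheet1
headers = wks.row_values(1)

url_col = column_index(headers, "Media URL")
status_col = column_index(headers, "Archive status")

# Skip the header row, rows without a URL, and rows already marked with a status.
for row in wks.get_all_values()[1:]:
    if not row[url_col - 1] or row[status_col - 1]:
        continue
    print("Would archive:", row[url_col - 1])
```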
@@ -2,6 +2,7 @@ import gspread
 import subprocess
 import argparse
 import auto_archive
+import datetime

 def main():
     parser = argparse.ArgumentParser(
@@ -10,15 +11,16 @@ def main():

     args = parser.parse_args()

+    print(datetime.datetime.now())
     print("Opening document " + args.sheet)

-    gc = gspread.service_account()
+    gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(args.sheet)

     wks = sh.get_worksheet(0)
     values = wks.get_all_values()

-    for i in range(1, len(values)):
+    for i in range(11, len(values)):
         sheet_name = values[i][0]

         print("Processing " + sheet_name)
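The two hunks above come from the companion driver script (its filename is not captured in this view): it opens a master document with `gspread`, reads the first worksheet, and treats column A of each row as a sheet name to process. A hedged sketch of that pattern is below; the subprocess invocation, document name, and starting row are assumptions for illustration, and only the gspread calls mirror the diff.

```python
# Hedged sketch of the driver loop shown above: read a master sheet and run the
# archiver once per sheet name listed in column A.
import subprocess
import gspread

gc = gspread.service_account(filename='service_account.json')
sh = gc.open("master-list")                    # hypothetical document name
values = sh.get_worksheet(0).get_all_values()

for row in values[1:]:                         # the diff itself starts further down
    sheet_name = row[0]
    if not sheet_name:
        continue
    print("Processing " + sheet_name)
    subprocess.run(["pipenv", "run", "python", "auto-archive.py", "--sheet", sheet_name])
```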
Binary file not shown. After: size 486 KiB
Binary file not shown. After: size 223 KiB
Binary file not shown. After: size 241 KiB