Update auto archiver docs with new header declaration method

pull/9/head
Logan Williams 2021-05-12 09:01:45 +02:00
parent 866c4fa7fd
commit 339f62fade
7 changed files with 70 additions and 47 deletions

Pipfile
View file

@@ -10,6 +10,7 @@ python-dotenv = "*"
 youtube_dl = "*"
 argparse = "*"
 ffmpeg-python = "*"
+beautifulsoup4 = "*"
 
 [dev-packages]

69
Pipfile.lock generated
View file

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "a5308ff7514ddcff08e39ae06fe8f96e63ea3eecfbf4c106c907c8aa6b76b8dd"
+            "sha256": "117af2367c550817e099dd85a55aad372e90f7e8f8940fdc69a8f8d052c3ea7b"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -24,29 +24,38 @@
             "index": "pypi",
             "version": "==1.4.0"
         },
-        "boto3": {
+        "beautifulsoup4": {
             "hashes": [
-                "sha256:d39c04b51e60197f5503f8489f043bc904981567cc8431d389367767dc3fd5ae",
-                "sha256:fe1898c5b10035528207995c9931b78f2f50bb70cf93bac353152aea47c04780"
+                "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35",
+                "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25",
+                "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"
             ],
             "index": "pypi",
-            "version": "==1.17.30"
+            "version": "==4.9.3"
+        },
+        "boto3": {
+            "hashes": [
+                "sha256:d856a71d74351649ca8dd59ad17c8c3e79ea57734ff4a38a97611e1e10b06863",
+                "sha256:da1b2c884dbf56cc3ece07940a7b654f41a93b9fc40ee1ed21a76da25a05989c"
+            ],
+            "index": "pypi",
+            "version": "==1.17.62"
         },
         "botocore": {
             "hashes": [
-                "sha256:63951595a736dfc9759f57e33bec6eaea4f09c4800626ef5309437060b263e48",
-                "sha256:98ff1eb210d394a1ffe736b33c8a7be68f30f0a03550b559c5bb6fdf0c29328d"
+                "sha256:e4f8cb923edf035c2ae5f6169c70e77e31df70b88919b92b826a6b9bd14511b1",
+                "sha256:f7c2c5c5ed5212b2628d8fb1c587b31c6e8d413ecbbd1a1cdf6f96ed6f5c8d5e"
             ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
-            "version": "==1.20.30"
+            "version": "==1.20.62"
         },
         "cachetools": {
             "hashes": [
-                "sha256:1d9d5f567be80f7c07d765e21b814326d78c61eb0c3a637dffc0e5d1796cb2e2",
-                "sha256:f469e29e7aa4cff64d8de4aad95ce76de8ea1125a16c68e0d93f65c3c3dc92e9"
+                "sha256:2cc0b89715337ab6dbba85b5b50effe2b0c74e035d83ee8ed637cf52f12ae001",
+                "sha256:61b5ed1e22a0924aed1d23b478f37e8d52549ff8a961de2909c69bf950020cff"
             ],
             "markers": "python_version ~= '3.5'",
-            "version": "==4.2.1"
+            "version": "==4.2.2"
         },
         "certifi": {
             "hashes": [
@@ -80,19 +89,19 @@
         },
         "google-auth": {
             "hashes": [
-                "sha256:9bd436d19ab047001a1340720d2b629eb96dd503258c524921ec2af3ee88a80e",
-                "sha256:dcaba3aa9d4e0e96fd945bf25a86b6f878fcb05770b67adbeb50a63ca4d28a5e"
+                "sha256:588bdb03a41ecb4978472b847881e5518b5d9ec6153d3d679aa127a55e13b39f",
+                "sha256:9ad25fba07f46a628ad4d0ca09f38dcb262830df2ac95b217f9b0129c9e42206"
             ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
-            "version": "==1.28.0"
+            "version": "==1.30.0"
         },
         "google-auth-oauthlib": {
             "hashes": [
-                "sha256:54431535309cfab50897d9c181e8c2226268825aa6e42e930b05b99c5041a18c",
-                "sha256:dabffbf594a6be2fd6d054060846d1201569252efb10dfb749b504a7591f8af0"
+                "sha256:09832c6e75032f93818edf1affe4746121d640c625a5bef9b5c96af676e98eee",
+                "sha256:0e92aacacfb94978de3b7972cf4b0f204c3cd206f74ddd0dc0b31e91164e6317"
             ],
             "markers": "python_version >= '3.6'",
-            "version": "==0.4.3"
+            "version": "==0.4.4"
         },
         "gspread": {
             "hashes": [
@@ -172,11 +181,11 @@
         },
         "python-dotenv": {
             "hashes": [
-                "sha256:0c8d1b80d1a1e91717ea7d526178e3882732420b03f08afea0406db6402e220e",
-                "sha256:587825ed60b1711daea4832cf37524dfd404325b7db5e25ebe88c495c9f807a0"
+                "sha256:00aa34e92d992e9f8383730816359647f358f4a3be1ba45e5a5cefd27ee91544",
+                "sha256:b1ae5e9643d5ed987fc57cc2583021e38db531946518130777734f9589b3141f"
             ],
             "index": "pypi",
-            "version": "==0.15.0"
+            "version": "==0.17.1"
         },
         "requests": {
             "hashes": [
@@ -204,10 +213,10 @@
         },
         "s3transfer": {
             "hashes": [
-                "sha256:1e28620e5b444652ed752cf87c7e0cb15b0e578972568c6609f0f18212f259ed",
-                "sha256:7fdddb4f22275cf1d32129e21f056337fd2a80b6ccef1664528145b72c49e6d2"
+                "sha256:9b3752887a2880690ce628bc263d6d13a3864083aeacff4890c1c9839a5eb0bc",
+                "sha256:cb022f4b16551edebbb31a377d3f09600dbada7363d8c5db7976e7f47732e1b2"
             ],
-            "version": "==0.3.4"
+            "version": "==0.4.2"
         },
         "six": {
             "hashes": [
@@ -217,6 +226,14 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==1.15.0"
         },
+        "soupsieve": {
+            "hashes": [
+                "sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc",
+                "sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b"
+            ],
+            "markers": "python_version >= '3.0'",
+            "version": "==2.2.1"
+        },
         "urllib3": {
             "hashes": [
                 "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df",
@@ -227,11 +244,11 @@
         },
         "youtube-dl": {
             "hashes": [
-                "sha256:c287ad8dd33471aabaabab5ab1dd825bebc70eb8b83ebfa93fd71022e01a1d08",
-                "sha256:d414166efe52447877db06803816277f52f405faeee2bdf5ef816b30e352b3b8"
+                "sha256:37972e16bb195a2cde7c0eebde1c650f3c17d3bc8020e49c512db79d6cfc31ae",
+                "sha256:6f311ffaf8b88cdcf27a2301a2272455e213bdb780aa447246933a3da4532879"
             ],
             "index": "pypi",
-            "version": "==2021.3.14"
+            "version": "==2021.4.26"
         }
     },
     "develop": {}

README.md
View file

@@ -19,36 +19,39 @@ DO_SPACES_SECRET=
 ## Running
-There are several necessary command line flags:
-* `--sheet name` sets the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`.
-* `--url-col`, `--status-col`, `--archive-col`, and `--date-col` give the letter name of the column for the original source URL, the archiver status, the archive location, and the archive date respectively.
+There is just one necessary command line flag, `--sheet name`, which gives the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`. The sheet must also declare specific columns in its first row (a sketch of how such a header lookup might work follows this section):
+* `Media URL` (required): the location of the media to be archived. This is the only column that should be supplied with data initially.
+* `Archive status` (required): the status of the auto archiver script. Any row with text in this column will be skipped automatically.
+* `Archive location` (required): the location of the archived version. For files that could not be auto archived, this can be updated manually.
+* `Archive date`: the date on which the auto archiver script ran for this file.
+* `Upload timestamp`: the timestamp extracted from the video. (For YouTube, this unfortunately does not currently include the time.)
+* `Duration`: the duration of the video.
+* `Upload title`: the "title" of the video from the original source.
+* `Thumbnail`: an image thumbnail of the video (resize the row height to make this more visible).
+* `Thumbnail index`: a link to a page that shows many thumbnails for the video, useful for quickly reviewing video content.
 For example, for use with this spreadsheet:
-![A screenshot of a Google Spreadsheet with a single Youtube URL in column A](docs/before.png)
+![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Media URL" column](docs/demo-before.png)
-```pipenv run python auto-archive.py --sheet archiver-test --url-col A --status-col B --archive-col C --date-col D```
+```pipenv run python auto-archive.py --sheet archiver-test```
-The Youtube link is downloaded and archived, and the spreadsheet is updated to the following:
+When the auto archiver starts running, it updates the "Archive status" column.
-![A screenshot of a Google Spreadsheet with the same Youtube URL in column A, and additional archival information in B, C, and D](docs/after.png)
+![A screenshot of a Google Spreadsheet with column headers defined as above, and several Youtube and Twitter URLs in the "Media URL" column. The auto archiver has added "archive in progress" to one of the status columns.](docs/demo-progress.png)
-By default, the archiver will skip over live streaming content. However, with the `--streaming` flag, it will skip over non-real-time content and archive livestreams instead. This is blocking, and each execution of the archiver will start downloading only a single livestreamed video, so that it can be used in combination with a non-streaming archiver, as detailed below.
+The links are downloaded and archived, and the spreadsheet is updated to the following:
-Note that the first row is skipped, as it is assumed to be a header row. Rows with an empty URL column, or a non-empty archive column, are also skipped.
+![A screenshot of a Google Spreadsheet with videos archived and metadata added per the description of the columns above.](docs/demo-after.png)
-Finally, by default only the first worksheet in a Google Sheet is checked. To check all of them, use the `--all-worksheets` flag. These worksheets must use the same column locations.
+Live streaming content is recorded in a separate thread.
+Note that the first row is skipped, as it is assumed to be a header row. Rows with an empty URL column, or a non-empty archive column, are also skipped. All sheets in the document will be checked.
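Editorial sketch: the new README above describes a header-declared column scheme but does not show the lookup code. A minimal sketch of how such a lookup might work with `gspread` is below; the helper `column_index` and the exact header handling are illustrative assumptions, not the actual `auto-archive.py` implementation.

```
import gspread

# Assumed header names, taken from the README's column list above.
REQUIRED_COLUMNS = ["Media URL", "Archive status", "Archive location"]

def column_index(headers, name):
    # gspread columns are 1-indexed, so shift the 0-indexed list position.
    return headers.index(name) + 1

gc = gspread.service_account(filename='service_account.json')
wks = gc.open("archiver-test").get_worksheet(0)

headers = wks.row_values(1)  # the first row declares the columns
columns = {name: column_index(headers, name) for name in REQUIRED_COLUMNS}

# e.g. read the Media URL in row 2, the first data row
url = wks.cell(2, columns["Media URL"]).value
```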
 ## Automating
-The auto-archiver can be run automatically via cron. To prevent overlapping executions (which should not affect archive integrity, but would use unnecessary compute and network resources), `flock` can be used to create a lockfile. An example crontab entry that runs the archiver every minute is as follows.
+The auto-archiver can be run automatically via cron. An example crontab entry that runs the archiver every minute is as follows.
-```* * * * * flock -w 0 archive.lock python auto-archive.py --sheet archiver-test --url-col A --status-col B --archive-col C --date-col D```
+```* * * * * python auto-archive.py --sheet archiver-test```
-Of course, additional logging information, etc. might be required.
-With streaming mode enabled, the archiver can run safely at any frequency (since each iteration affects only a single row in the spreadsheet and marks when streaming has started). An example crontab line to run it every minute is as follows:
-```* * * * * python auto-archive.py --sheet archiver-test --url-col A --status-col B --archive-col C --date-col D --streaming```
-When these two cronjobs are used together, the archiver should archive and store all media added to the Google Sheet every 60 seconds.
+With this configuration, the archiver should archive and store all media added to the Google Sheet every 60 seconds. Of course, additional logging information, etc. might be required.
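Editorial note: since the README leaves "additional logging information" to the reader, one possible crontab entry that captures the archiver's output to a log file (the working directory and log file name here are placeholders, not from the repo) is:

```* * * * * cd /path/to/auto-archiver && pipenv run python auto-archive.py --sheet archiver-test >> archive.log 2>&1```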

View file

@@ -2,6 +2,7 @@ import gspread
 import subprocess
 import argparse
 import auto_archive
+import datetime
 
 def main():
     parser = argparse.ArgumentParser(
@@ -10,15 +11,16 @@ def main():
     args = parser.parse_args()
 
+    print(datetime.datetime.now())
     print("Opening document " + args.sheet)
 
-    gc = gspread.service_account()
+    gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(args.sheet)
 
     wks = sh.get_worksheet(0)
     values = wks.get_all_values()
 
-    for i in range(1, len(values)):
+    for i in range(11, len(values)):
         sheet_name = values[i][0]
         print("Processing " + sheet_name)

BIN  docs/demo-after.png 100644
Binary file not shown (new image, 486 KiB).

BIN  docs/demo-before.png
Binary file not shown (new image, 223 KiB).

BIN  docs/demo-progress.png
Binary file not shown (new image, 241 KiB).