download twitter media with aiohttp.

master
stockbsd 2019-10-24 11:27:36 +08:00
rodzic 38d8025df6
commit dbb0a55e36
9 zmienionych plików z 569 dodań i 0 usunięć

42
README.md 100644
Wyświetl plik

@ -0,0 +1,42 @@
# Download twitter resources
Download tweet images and videos. Runs worker threads, each with its own event loop, to download resources asynchronously.
```
pip3 install twitter-dl
```
```
usage: twitter-dl [-h] [-c CONFIDENTIAL]
[-s {large,medium,small,thumb,orig}]
[--tweet] [--video] [--nophoto]
[-l LIMIT] [--rts]
[--thread-number THREAD_NUMBER]
[--coro-number CORO_NUMBER]
[--since SID]
resource_id dest
Download all images uploaded by a twitter user you specify
positional arguments:
resource_id An ID of a twitter user. Also accept tweet url or
tweet id.
dest Specify where to put images
optional arguments:
-h, --help show this help message and exit
-c CONFIDENTIAL, --confidential CONFIDENTIAL
a json file containing a key and a secret
-s {large,medium,small,thumb,orig}, --size {large,medium,small,thumb,orig}
specify the size of images
--tweet indicate you gave a tweet url or tweet id
--video include video
--nophoto exclude photo
-l LIMIT, --limit LIMIT
the maximum number of tweets to check (most recent
first)
--rts save images contained in retweets
--thread-number THREAD_NUMBER
--coro-number CORO_NUMBER
--since SID
```

Wyświetl plik

@ -0,0 +1,6 @@
{
"access_token": "",
"access_token_secret": "",
"consumer_key": "",
"consumer_secret": ""
}

2
requirements.txt 100644
Wyświetl plik

@ -0,0 +1,2 @@
requests>=2.20.0
aiohttp>=3.4.4

48
setup.py 100644
Wyświetl plik

@ -0,0 +1,48 @@
"""Packaging script for twitter-dl.

Reads install requirements from requirements.txt, the long description from
README.md, and the version string from twitter_dl/__init__.py.
"""
# Always prefer setuptools over distutils
from setuptools import setup, find_packages
import os

here = os.path.abspath(os.path.dirname(__file__))

# Pin the encoding so the build does not depend on the platform default
# (e.g. cp1252 on Windows would choke on non-ASCII bytes in README.md).
with open(os.path.join(here, 'requirements.txt'), encoding='utf-8') as fh:
    # Skip blank lines so a trailing newline does not yield an empty requirement.
    requirements = [line.strip() for line in fh if line.strip()]
with open(os.path.join(here, 'README.md'), encoding='utf-8') as fh:
    Readme = fh.read()


def get_version():
    """Return the version declared in twitter_dl/__init__.py.

    Looks for a line starting with "version" and extracts the quoted value.

    Raises:
        RuntimeError: if no such line is found.
    """
    version_file = os.path.join(here, "twitter_dl", "__init__.py")
    # Use a context manager: the original iterated an unclosed file object.
    with open(version_file, encoding='utf-8') as fh:
        for line in fh:
            if line.startswith("version"):
                return line.split("=")[1].strip().replace("'", "").replace('"', '')
    raise RuntimeError("Unable to find version string in %s" % version_file)


name = "twitter-dl"
git_repo = "https://github.com/stockbsd/{}".format(name)

setup(
    name=name,
    version=get_version(),
    description="Download tweet images and videos",
    long_description=Readme,
    long_description_content_type="text/markdown",
    install_requires=requirements,
    packages=find_packages(exclude=["contrib", "docs", "tests"]),
    entry_points={
        "console_scripts": [
            "twitter-dl=twitter_dl.__main__:main"
        ]
    },
    url=git_repo,
    author="stockbsd",
    author_email="stockbsd@gmail.com",
    keywords="twitter",
    project_urls={"Bug Reports": git_repo, "Source": git_repo},
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)

Wyświetl plik

@ -0,0 +1,3 @@
# Re-export the main entry class so users can do `from twitter_dl import Downloader`.
from .downloader import Downloader
# Version string -- setup.py's get_version() parses this line textually
# (it looks for a line starting with "version"), so keep this exact form.
version = "0.1.3"

Wyświetl plik

@ -0,0 +1,104 @@
import os
import argparse
import json
import re
import logging
from . import Downloader
from .exceptions import *
def main():
    """CLI entry point: parse arguments and dispatch to the Downloader.

    Reads API credentials from a JSON file (``--confidential``, defaulting to
    ``$TWITTER_AUTH`` or ``~/.twitter.json``), then downloads the media of a
    single tweet (``--tweet``), a list (``--list``, ``user:slug``), or a
    user's timeline (the default).

    Raises:
        ConfidentialsNotSuppliedError: when no credential file is given or it
            lacks ``consumer_key``/``consumer_secret``.
    """
    DEBUG = os.getenv("DEBUG")
    logging.basicConfig(level=logging.DEBUG if DEBUG else logging.INFO,
                        format='%(levelname)-7s %(name)11s: %(message)s')

    parser = argparse.ArgumentParser(
        description="Download all images uploaded by a twitter user you specify"
    )
    parser.add_argument(
        "resource_id",
        help="An ID of a twitter user. Also accept tweet url or tweet id.",
    )
    parser.add_argument("dest", help="Specify where to put images")
    parser.add_argument(
        "-c",
        "--confidential",
        help="a json file containing a key and a secret",
        default=os.getenv("TWITTER_AUTH", os.path.expanduser("~/.twitter.json")),
    )
    parser.add_argument(
        "-s",
        "--size",
        help="specify the size of images",
        default="orig",
        choices=["large", "medium", "small", "thumb", "orig"],
    )
    parser.add_argument(
        "--tweet",
        help="indicate you gave a tweet url or tweet id",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--list",
        help="indicate you gave a list by user:list",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--video", help="include video", default=False, action="store_true"
    )
    parser.add_argument(
        "--nophoto", dest="photo", help="exclude photo", action="store_false"
    )
    parser.add_argument(
        "-l",
        "--limit",
        type=int,
        help="the maximum number of tweets to check (most recent first)",
        default=3200,
    )
    parser.add_argument(
        "--since",
        type=int,
        help="the min id of tweets to check (most recent first)",
        default=0,
    )
    parser.add_argument(
        "--rts", help="save images contained in retweets", action="store_true"
    )
    parser.add_argument("--thread-number", type=int, default=2)
    parser.add_argument("--coro-number", type=int, default=5)
    args = parser.parse_args()

    if args.confidential:
        with open(args.confidential) as f:
            # json.load reads the stream directly; no need for loads(f.read()).
            confidential = json.load(f)
        if "consumer_key" not in confidential or "consumer_secret" not in confidential:
            raise ConfidentialsNotSuppliedError()
        api_key = confidential["consumer_key"]
        api_secret = confidential["consumer_secret"]
    else:
        raise ConfidentialsNotSuppliedError(args.confidential)

    downloader = Downloader(api_key, api_secret, args.thread_number, args.coro_number)
    if args.tweet:
        downloader.download_media_of_tweet(args.resource_id, args.dest, args.size, args.video,
                                           args.photo)
    elif args.list:
        # A list is addressed as "<owner screen name>:<list slug>".
        username, listname = args.resource_id.split(':')
        downloader.download_media_of_list(username, listname, args.dest, args.size,
                                          args.limit, args.rts, args.video, args.photo, args.since)
    else:
        downloader.download_media_of_user(args.resource_id, args.dest, args.size,
                                          args.limit, args.rts, args.video, args.photo, args.since)
    # Block until all queued downloads finish (previously duplicated in every branch).
    downloader.d.join()


if __name__ == "__main__":
    main()

Wyświetl plik

@ -0,0 +1,250 @@
import os
import sys
import logging
import base64
import json
import requests
from .exceptions import *
from .threaded_aio_dlder import AioDownloader
def ensure_dir(directory):
    """Create *directory* (and any parents) if needed; return its absolute path.

    Args:
        directory: Target directory path, absolute or relative.

    Returns:
        The absolute path of the now-existing directory.
    """
    directory = os.path.abspath(directory)
    # exist_ok=True makes the exists() pre-check redundant and avoids the
    # check-then-create race the original had.
    os.makedirs(directory, exist_ok=True)
    return directory
class Downloader:
    """Discover tweet media URLs via the Twitter 1.1 REST API and queue them.

    Tweet/timeline fetching is synchronous (requests); the media downloads
    themselves are handed to a threaded asyncio downloader (``self.d``).
    Callers should call ``self.d.join()`` once everything is queued.
    """

    def __init__(self, api_key, api_secret, thread_number=2, coro_number=5):
        self.log = logging.getLogger("downloader")
        # Exchange the app credentials for an OAuth2 bearer token once; all
        # later API calls reuse it.
        self.bearer_token = self.bearer(api_key, api_secret)
        # Do not log the token value itself -- it is a credential.
        self.log.info("Bearer token acquired")
        self.d = AioDownloader()
        self.d.start(thread_number, coro_number)

    def bearer(self, key, secret):
        """Fetch the OAuth2 bearer token and return it.

        Args:
            key: API (consumer) key.
            secret: API (consumer) secret.

        Raises:
            BearerTokenNotFetchedError: if Twitter does not answer with 200.
        """
        credential = base64.b64encode(
            bytes("{}:{}".format(key, secret), "utf-8")
        ).decode()
        url = "https://api.twitter.com/oauth2/token"
        headers = {
            "Authorization": "Basic {}".format(credential),
            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
        }
        payload = {"grant_type": "client_credentials"}
        r = requests.post(url, headers=headers, params=payload)
        if r.status_code == 200:
            return r.json()["access_token"]
        raise BearerTokenNotFetchedError()

    def download_media_of_tweet(self, tid, save_dest, size="large", include_video=False,
                                include_photo=True):
        """Download the media of a single tweet into save_dest."""
        save_dest = ensure_dir(save_dest)
        tweet = self.get_tweet(tid)
        # get_tweet() returns None on API errors; previously that None was
        # passed straight on and crashed process_tweet().
        if tweet is not None:
            self.process_tweet(tweet, save_dest, size, include_video, include_photo)

    def download_media_of_user(self, user, save_dest, size="large", limit=3200, rts=False,
                               include_video=False, include_photo=True, since_id=0):
        """Download and save media that a user uploaded.

        Args:
            user: User screen name.
            save_dest: The directory where media will be saved.
            size: Which size of images to download.
            limit: Maximum number of tweets to inspect (most recent first).
            rts: Whether to include retweets or not.
            include_video: Also download videos / animated gifs.
            include_photo: Download photos.
            since_id: Only inspect tweets newer than this id (exclusive).
        """
        save_dest = ensure_dir(save_dest)
        alltweets = self.get_user_tweets(user, None, limit, rts, since_id)
        for tweet in alltweets:
            # Forward `size`: it used to be dropped here, so the CLI -s option
            # had no effect on timeline downloads.
            self.process_tweet(tweet, save_dest, size,
                               include_video=include_video, include_photo=include_photo)

    def download_media_of_list(self, user, listname, save_dest, size="large", limit=3200,
                               rts=False, include_video=False, include_photo=True, since_id=0):
        """Download and save media of a list.

        Args:
            user: List owner screen name.
            listname: List slug.
            save_dest: The directory where media will be saved.
            size: Which size of images to download.
            limit: Maximum number of tweets to inspect (most recent first).
            rts: Whether to include retweets or not.
            include_video: Also download videos / animated gifs.
            include_photo: Download photos.
            since_id: Only inspect tweets newer than this id (exclusive).
        """
        save_dest = ensure_dir(save_dest)
        alltweets = self.get_list_tweets(user, listname, None, limit, rts, since_id)
        for tweet in alltweets:
            # Forward `size` (same fix as in download_media_of_user).
            self.process_tweet(tweet, save_dest, size,
                               include_video=include_video, include_photo=include_photo)

    def api_fetch_tweets(self, url, payload, start, count, rts, since_id):
        """Page through a timeline endpoint; return up to `count` tweets.

        Args:
            url: Twitter API endpoint URL.
            payload: Endpoint-specific query parameters (mutated in place).
            start: Tweet id to start paging from, or None/0 for the newest.
            count: Maximum number of tweets to collect.
            rts: Whether to include retweets.
            since_id: Lower bound (exclusive) for tweet ids; 0 disables it.
        """
        headers = {"Authorization": "Bearer {}".format(self.bearer_token)}
        payload["count"] = count
        payload["include_rts"] = rts
        if start:
            payload["max_id"] = start - 1  # max_id is inclusive
        if since_id:
            payload["since_id"] = since_id  # since_id is exclusive
        alltweets = []
        while True:
            r = requests.get(url, headers=headers, params=payload)
            tweets = []
            if r.status_code == 200:
                tweets = r.json()
            else:
                self.log.error(f"An error occurred with the request, status code was {r.status_code}")
            if not tweets:
                break
            alltweets.extend(tweets)
            # Page backwards: next request starts just below the oldest id seen.
            payload["max_id"] = tweets[-1]['id'] - 1
            payload['count'] = count - len(alltweets)
            if len(alltweets) >= count:
                break
            if len(tweets) < 200:  # No more tweets left: 200 is the twitter-api page limit
                break
        self.log.info(f"Got {len(alltweets)} tweets")
        return alltweets

    def get_user_tweets(self, user, start=None, count=200, rts=False, since_id=0):
        """Fetch a user's timeline tweets and return them as a list.

        Args:
            user: User screen name.
            start: Tweet id to start paging from, or None.
            count: Maximum number of tweets to fetch.
            rts: Whether to include retweets or not.
            since_id: Lower bound (exclusive) for tweet ids.
        """
        apiurl = "https://api.twitter.com/1.1/statuses/user_timeline.json"
        payload = {"screen_name": user}
        return self.api_fetch_tweets(apiurl, payload, start, count, rts, since_id)

    def get_list_tweets(self, username, listname, start=None, count=200, rts=False, since_id=0):
        """Fetch a list's tweets and return them as a list.

        Args:
            username: List owner screen name.
            listname: List slug.
            start: Tweet id to start paging from, or None.
            count: Maximum number of tweets to fetch.
            rts: Whether to include retweets or not.
            since_id: Lower bound (exclusive) for tweet ids.
        """
        apiurl = "https://api.twitter.com/1.1/lists/statuses.json"
        payload = {"owner_screen_name": username, "slug": listname}
        return self.api_fetch_tweets(apiurl, payload, start, count, rts, since_id)

    def get_tweet(self, id):
        """Fetch a single tweet; return its dict, or None on API error.

        Args:
            id: Tweet ID. (Name kept for caller compatibility even though it
                shadows the builtin.)
        """
        url = "https://api.twitter.com/1.1/statuses/show.json"
        headers = {"Authorization": f"Bearer {self.bearer_token}"}
        payload = {"id": id, "include_entities": "true"}
        r = requests.get(url, headers=headers, params=payload)
        if r.status_code == 200:
            tweet = r.json()
            self.log.info(f"Got tweet with id {id} of user @{tweet['user']['name']}")
            return tweet
        self.log.error(f"An error occurred , status code was {r.status_code}")
        return None

    def process_tweet(self, tweet, save_dest, size="large", include_video=False, include_photo=True):
        """Queue every media item of `tweet` for download; return the count."""
        if 'retweeted_status' in tweet:
            # Work on the original tweet so retweeted media is not re-named.
            tweet = tweet['retweeted_status']
            self.log.debug('this is a retweet, turn to orignal tweet')
        id_str = tweet["id_str"]
        images = self.extract_media_list(tweet, include_video, include_photo)
        for i, image in enumerate(images, 1):
            # Name files "<tweet_id>-<n>" so multiple items never collide.
            self.save_media(image, save_dest, f"{id_str}-{i}", size)
        return len(images)

    def extract_media_list(self, tweet, include_video, include_photo):
        """Return the media urls embedded in `tweet` (possibly from its quote).

        Args:
            tweet: A dict object representing a tweet.
            include_video: Include video / animated_gif urls.
            include_photo: Include photo urls.
        """
        extended = tweet.get("extended_entities")
        if not extended and ("quoted_status" in tweet):
            extended = tweet['quoted_status'].get("extended_entities")
            self.log.debug('Extract media from quoted')
        if not extended:
            return []
        rv = []
        for x in extended.get("media", []):
            if x["type"] == "photo" and include_photo:
                rv.append(x["media_url"])
            elif x["type"] in ["video", "animated_gif"] and include_video:
                # Pick the highest-bitrate variant without mutating the tweet
                # dict (the original sorted the variants list in place).
                variants = sorted(x["video_info"]["variants"],
                                  key=lambda v: v.get("bitrate", 0))
                url = variants[-1]["url"].rsplit("?tag")[0]
                rv.append(url)
        return rv

    def save_media(self, image, path, name, size="large"):
        """Queue one media url for download (skips falsy urls and existing files).

        Args:
            image: The url of the media item.
            path: The directory where the file will be saved.
            name: Base file name; the extension is taken from the url.
            size: Size suffix appended as ":<size>" for photos; mp4 videos
                are fetched as-is.
        """
        if not image:
            return
        ext = os.path.splitext(image)[1]
        save_file = os.path.join(path, name + ext)
        # Photos support Twitter's ":size" url suffix; mp4 videos do not.
        real_url = image if ext in [".mp4"] else image + ":" + size
        if not os.path.exists(save_file):
            self.d.add_url(real_url, save_file)
        else:
            self.log.info(f"Skipping downloaded {image}")

Wyświetl plik

@ -0,0 +1,14 @@
# Exception hierarchy for the twitter_dl package; all errors derive from Error
# so callers can catch the package's failures with a single except clause.
class Error(Exception):
    '''Base-class for all exceptions raised by this module.'''
class ConfidentialsNotSuppliedError(Error):
    '''An API key and an API secret must be supplied.'''
class BearerTokenNotFetchedError(Error):
    '''Couldn't fetch the bearer token.'''
class InvalidDownloadPathError(Error):
    '''Download path must be a directory.'''

Wyświetl plik

@ -0,0 +1,100 @@
import asyncio
import aiohttp
import threading
import logging
from queue import Queue, Empty
# threaded asyncio
def loop_in_thread(async_entry, *args, **kwargs):
    """Thread target: drive `async_entry(*args, **kwargs)` in a fresh event loop.

    Each worker thread gets its own event loop so loops never compete across
    threads.

    Args:
        async_entry: Coroutine function to run to completion.
        *args, **kwargs: Forwarded to `async_entry`.
    """
    log = logging.getLogger('LoopThread')
    log.debug('loop begin...')
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(async_entry(*args, **kwargs))
    finally:
        # Always close the loop -- the original leaked it when the coroutine
        # raised.
        loop.close()
    log.debug('loop end...')
class AioDownloader():
    """Multi-threaded asyncio downloader.

    Each worker thread runs its own event loop; work items are distributed
    through a shared thread-safe Queue, and within one thread at most
    ``num_coros`` downloads are in flight at once (bounded by a semaphore).
    """
    def __init__(self):
        # Shared work queue of (url, dest) pairs; (None, None) is the
        # end-of-work sentinel.
        self.q = Queue()
        # Worker threads created by start(); joined by join().
        self.threads = []
        self.log = logging.getLogger('AioDlder')
    def start(self, num_threads, num_coros):
        """Spawn num_threads workers, each running up to num_coros coroutines."""
        for _ in range(num_threads):
            t = threading.Thread(target=loop_in_thread,
                                 args=(self.sched_downloaders, num_coros))
            self.threads.append(t)
            t.start()
    def join(self):
        """Signal end of work, then block until every worker thread exits."""
        self.add_endsignal()
        for t in self.threads:
            t.join()
    def add_endsignal(self):
        # One sentinel stops one worker; the worker re-queues it in
        # sched_downloaders() so its peer threads stop too.
        self.q.put((None, None))
    def add_url(self, url, dest):
        """Queue one download: fetch `url` and write the body to path `dest`."""
        self.q.put((url, dest))
    async def downloader(self, session, url, dest, sem):
        """Fetch one url into `dest`; always releases `sem` when done."""
        try:
            #now = loop.time()
            async with session.get(url) as resp:
                if resp.status == 200:
                    # Whole body is buffered in memory before writing --
                    # fine for tweet media, would not suit huge files.
                    data = await resp.read()
                    with open(dest, 'wb') as f:
                        f.write(data)
                    #self.log.info(f'{resp.url} ==> {dest}, {loop.time()-now:.2f}s used')
                    self.log.info(f'{resp.url} ==> {dest}')
                else:
                    self.log.warning(f'{resp.url} status = {resp.status}')
        except Exception as e:
            # Best-effort: log and continue so one bad URL does not kill the loop.
            self.log.warning(f'{url} failed: {e}')
        finally:
            sem.release()
    # async entry point
    async def sched_downloaders(self, num_coros):
        """Consume the queue, keeping at most num_coros downloads in flight.

        NOTE(review): ``self.q.get(True)`` is a *blocking* call inside a
        coroutine -- it stalls this thread's event loop (and its in-flight
        downloads) until an item arrives. Presumably acceptable because the
        queue is usually non-empty while work remains; confirm if latency of
        in-flight downloads matters.
        """
        loop = asyncio.get_event_loop() #prefer get_running_loop in py>=3.7
        # Bounds concurrency: acquired before scheduling a task, released in
        # downloader()'s finally clause.
        sem = asyncio.Semaphore(num_coros)
        async with aiohttp.ClientSession(loop=loop) as session:
            tasks = []
            while True:
                await sem.acquire()
                url, dest = self.q.get(True) #block
                if url is None:
                    self.add_endsignal() #notify peer threads
                    break
                else:
                    tasks.append(loop.create_task(self.downloader(session, url, dest, sem)))
            # waiting for downloader tasks to finish
            await asyncio.gather(*[t for t in tasks if not t.done()])
        self.log.info('Queue Finished')
if "__main__" == __name__:
import time, sys
logging.basicConfig(
level=logging.INFO,
format='%(threadName)10s %(name)12s: %(message)s',
stream=sys.stderr,
)
dld = AioDownloader()
for i in range(30):
dld.add_url(f"http://httpbin.org/delay/1?a={i}", '/dev/null')
dld.add_endsignal()
t0 = time.time()
log = logging.getLogger('main')
log.info('start worker threads')
dld.start(2, 5)
dld.join()
log.info('all workers exit')
print(f'{time.time()-t0} seconds')