Switch to generators for scrapers

dev
Michael Kuperfish Steinberg 2023-01-21 11:58:07 +02:00
rodzic a4131dadf0
commit d54485d025
2 zmienionych plików z 26 dodań i 44 usunięć

Wyświetl plik

@ -1,5 +1,6 @@
from threading import Thread, get_ident
import pickle
from typing import Generator
from spotify_client import SpotifyClient
from spotify_scraper import SpotifyScraper
from config import *
@ -8,6 +9,7 @@ from time import sleep
from datetime import datetime
import random
from utils.utils import clean_file_path
from utils.spotify_track import SpotifyTrack
client = SpotifyClient(sp_key=SP_KEY, sp_dc=SP_DC)
client.get_me()
@ -48,7 +50,7 @@ class Console:
console = Console()
def download_track_list(download_dir: str, track_list: list, recursive_artist: bool=False, recursive_album: bool=False, recursive: bool=False, recursive_limit=1024):
def download_track_list(download_dir: str, track_list: Generator[SpotifyTrack, None, None], recursive_artist: bool=False, recursive_album: bool=False, recursive: bool=False, recursive_limit=1024):
global g_downloaded_songs, g_downloaded_artist_covers
my_thread_id = str(get_ident()).zfill(6)
artist_images_download_dir = f'{download_dir}/{settings.ARTIST_IMAGES_SUB_DIR}'
@ -65,12 +67,8 @@ def download_track_list(download_dir: str, track_list: list, recursive_artist: b
track.download_to_file(scraper, track_path)
console.happy(f'Thread<{my_thread_id}> | Downloaded: {track.preview_title()}')
g_downloaded_songs.append(track.spotify_id)
if (recursive_album or recursive) and len(track_list) < recursive_limit:
new_tracks = list(scraper.scrape_album_tracks(track.album.spotify_id))
for new_track in new_tracks:
if new_track not in track_list and len(track_list) < recursive_limit:
track_list.append(new_track)
console.log(f'Thread<{my_thread_id}> | Scraped {len(new_tracks)} new songs through recursive album!')
if (recursive_album or recursive):
download_track_list(download_dir=download_dir, track_list=scraper.scrape_album_tracks(track.album.spotify_id), recursive=False)
for artist in track.artists:
if artist.spotify_id not in g_downloaded_artist_covers:
@ -83,19 +81,16 @@ def download_track_list(download_dir: str, track_list: list, recursive_artist: b
console.error(str(ex))
g_downloaded_artist_covers.append(artist.spotify_id)
if (recursive_artist or recursive) and len(track_list) < recursive_limit:
old_size = len(track_list)
track_list += list(scraper.scrape_artist_tracks(artist.spotify_id))
if (recursive_artist or recursive):
download_track_list(download_dir=download_dir, track_list=scraper.scrape_artist_tracks(track.artist.spotify_id), recursive=False)
if recursive_artist:
albums = list(scraper.scrape_artist_albums(artist.spotify_id))
for album in albums:
track_list += list(scraper.scrape_album_tracks(album['id']))
console.log(f'Thread<{my_thread_id}> | Scraped {len(track_list) - old_size} new songs through recursive artist!')
for album in scraper.scrape_artist_albums(artist.spotify_id):
download_track_list(download_dir=download_dir, track_list=scraper.scrape_album_tracks(album['id']), recursive=False)
except Exception as ex:
console.error(f'Thread<{my_thread_id}> | Exception: {ex}')
downloaded_count += 1
if settings.VERBOSE_OUTPUTS:
console.log(f'Thread<{my_thread_id}> | Processed {downloaded_count} / {len(track_list)}')
console.log(f'Thread<{my_thread_id}> | Processed {downloaded_count} tracks')
def save_globals_save_file():
@ -133,18 +128,7 @@ def full_download(download_dir: str, identifier: str, recursive_artist: bool=Fal
client.refresh_tokens()
console.log(f'Recieved scrape command on identifier: {identifier}, {recursive=}, {recursive_artist=}, {recursive_album=}, {recursive_limit=}, {thread_count=}')
download_threads = []
track_list = []
for track in scraper.scrape_tracks(identifier, console=console):
track_list.append(track)
if len(track_list) == recursive_limit / thread_count:
download_threads.append(Thread(target=download_track_list, args=(download_dir, list(track_list), recursive_artist, recursive_album, recursive, recursive_limit)))
download_threads[-1].start()
sleep(0.05)
download_threads.append(Thread(target=download_track_list, args=(download_dir, list(track_list), recursive_artist, recursive_album, recursive, recursive_limit)))
download_threads[-1].start()
[x.join() for x in download_threads]
download_track_list(download_dir=download_dir, track_list=scraper.scrape_tracks(identifier, console=console), recursive=recursive, recursive_album=recursive_album, recursive_artist=recursive_artist, recursive_limit=recursive_limit)
console.log(f'Comletely done scraping identifier: {identifier}!')

Wyświetl plik

@ -1,6 +1,8 @@
from enum import Enum
from typing import Generator
from config import *
from utils.spotify_track import SpotifyTrack
from utils.spotify_album import SpotifyAlbum
from utils.spotify_playlist import SpotifyPlaylist
from utils.spotify_category import SpotifyCategory
from spotify_client import SpotifyClient
@ -39,7 +41,7 @@ class SpotifyScraper:
def extract_id_from_link(self, link: str) -> str:
    """Return the id segment after the last '/' of a Spotify link.

    Raises ValueError (from str.rindex) when the string contains no '/'.
    NOTE(review): a link carrying a query string (e.g. '...?si=...') would
    keep it attached to the id — confirm callers never pass such links.
    """
    last_slash = link.rindex('/')
    return link[last_slash + 1:]
def scrape_tracks(self, link: str, console=None) -> list[SpotifyTrack]:
def scrape_tracks(self, link: str, console=None) -> Generator[SpotifyTrack, None, None]:
id_type = self.identify_link_type(link)
if id_type == self.IDTypes.Playlist:
return self.scrape_playlist_tracks(self.extract_id_from_link(link))
@ -80,7 +82,7 @@ class SpotifyScraper:
def scrape_album(self, album_id: str):
    """Fetch a single album's raw JSON payload from the Spotify Web API."""
    endpoint = f'https://api.spotify.com/v1/albums/{album_id}'
    return self._client.get(endpoint).json()
def scrape_album_tracks(self, album_id: str):
def scrape_album_tracks(self, album_id: str) -> Generator[SpotifyTrack, None, None]:
limit = 50
offset = 0
ret = self._client.get(f'https://api.spotify.com/v1/albums/{album_id}/tracks?limit={limit}').json()
@ -95,34 +97,30 @@ class SpotifyScraper:
def scrape_artist(self, artist_id: str):
    """Return the raw JSON for the artist's top tracks (market=from_token).

    NOTE(review): despite the name, this hits the /top-tracks endpoint, not
    the artist profile endpoint — callers rely on the 'tracks' key.
    """
    endpoint = f'https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market=from_token'
    return self.get(endpoint).json()
def scrape_artist_albums(self, artist_id: str) -> Generator[SpotifyAlbum, None, None]:
    """Lazily yield every album of an artist as a SpotifyAlbum.

    Walks Spotify's paginated /artists/{id}/albums endpoint (50 per page),
    yielding each page's items as soon as the page arrives, and following
    'next' until the API reports no further page. Being a generator, it
    holds at most one page in memory instead of accumulating a full list.
    """
    offset = 0
    limit = 50
    albums_data = self.get(
        f'https://api.spotify.com/v1/artists/{artist_id}/albums?market=from_token&limit={limit}&offset={offset}'
    ).json()
    for album in albums_data['items']:
        yield SpotifyAlbum(album)
    # 'next' is null on the final page per the Spotify pagination contract.
    while albums_data['next'] is not None:
        offset += limit
        albums_data = self.get(
            f'https://api.spotify.com/v1/artists/{artist_id}/albums?market=from_token&limit={limit}&offset={offset}'
        ).json()
        for album in albums_data['items']:
            yield SpotifyAlbum(album)
def scrape_artist_tracks(self, artist_id: str, intense: bool = False, console=None) -> Generator[SpotifyTrack, None, None]:
    """Yield the artist's top tracks; with intense=True, also crawl albums.

    First yields each top track as a SpotifyTrack. When `intense` is set,
    additionally iterates every album (via scrape_artist_albums) and yields
    every track on each album. Progress is logged to `console` if given.
    """
    tracks = self.scrape_artist(artist_id)['tracks']
    # Best-effort display name; payload may be empty or oddly shaped.
    try:
        artist_name = tracks[0]['album']['artists'][0]['name']
    except (IndexError, KeyError, TypeError):
        artist_name = 'Unknown'
    for track_data in tracks:
        yield SpotifyTrack(track_data)
    if intense:
        processed_album_count = 0
        for album in self.scrape_artist_albums(artist_id):
            for track in self.scrape_album_tracks(album.spotify_id):
                yield track
            processed_album_count += 1
            # Fix: the old log used len(albums), but albums is now a
            # generator (and the variable no longer exists) — len() would
            # raise; report only the running count.
            if console is not None:
                console.log(f'Scraping {artist_name}\'s albums: {processed_album_count}')
def get(self, url: str) -> Response:
    """Perform a GET request through the authenticated Spotify client."""
    response = self._client.get(url)
    return response
@ -180,7 +178,7 @@ class SpotifyScraper:
tracks = self.scrape_playlist_tracks(playlist_id)
return SpotifyPlaylist(spotify_id=playlist_id, tracks=tracks, data=playlist_data)
def scrape_user_items(self, user_id: str, limit:int=50) -> list[SpotifyTrack]:
def scrape_user_items(self, user_id: str, limit:int=50) -> Generator[SpotifyTrack, None, None]:
has_next = True
user_playlists = []
while has_next: