kopia lustrzana https://github.com/Michael-K-Stein/SpotiFile
Switch to generators for scrapers
rodzic
a4131dadf0
commit
d54485d025
|
@ -1,5 +1,6 @@
|
||||||
from threading import Thread, get_ident
|
from threading import Thread, get_ident
|
||||||
import pickle
|
import pickle
|
||||||
|
from typing import Generator
|
||||||
from spotify_client import SpotifyClient
|
from spotify_client import SpotifyClient
|
||||||
from spotify_scraper import SpotifyScraper
|
from spotify_scraper import SpotifyScraper
|
||||||
from config import *
|
from config import *
|
||||||
|
@ -8,6 +9,7 @@ from time import sleep
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import random
|
import random
|
||||||
from utils.utils import clean_file_path
|
from utils.utils import clean_file_path
|
||||||
|
from utils.spotify_track import SpotifyTrack
|
||||||
|
|
||||||
client = SpotifyClient(sp_key=SP_KEY, sp_dc=SP_DC)
|
client = SpotifyClient(sp_key=SP_KEY, sp_dc=SP_DC)
|
||||||
client.get_me()
|
client.get_me()
|
||||||
|
@ -48,7 +50,7 @@ class Console:
|
||||||
console = Console()
|
console = Console()
|
||||||
|
|
||||||
|
|
||||||
def download_track_list(download_dir: str, track_list: list, recursive_artist: bool=False, recursive_album: bool=False, recursive: bool=False, recursive_limit=1024):
|
def download_track_list(download_dir: str, track_list: Generator[SpotifyTrack, None, None], recursive_artist: bool=False, recursive_album: bool=False, recursive: bool=False, recursive_limit=1024):
|
||||||
global g_downloaded_songs, g_downloaded_artist_covers
|
global g_downloaded_songs, g_downloaded_artist_covers
|
||||||
my_thread_id = str(get_ident()).zfill(6)
|
my_thread_id = str(get_ident()).zfill(6)
|
||||||
artist_images_download_dir = f'{download_dir}/{settings.ARTIST_IMAGES_SUB_DIR}'
|
artist_images_download_dir = f'{download_dir}/{settings.ARTIST_IMAGES_SUB_DIR}'
|
||||||
|
@ -65,12 +67,8 @@ def download_track_list(download_dir: str, track_list: list, recursive_artist: b
|
||||||
track.download_to_file(scraper, track_path)
|
track.download_to_file(scraper, track_path)
|
||||||
console.happy(f'Thread<{my_thread_id}> | Downloaded: {track.preview_title()}')
|
console.happy(f'Thread<{my_thread_id}> | Downloaded: {track.preview_title()}')
|
||||||
g_downloaded_songs.append(track.spotify_id)
|
g_downloaded_songs.append(track.spotify_id)
|
||||||
if (recursive_album or recursive) and len(track_list) < recursive_limit:
|
if (recursive_album or recursive):
|
||||||
new_tracks = list(scraper.scrape_album_tracks(track.album.spotify_id))
|
download_track_list(download_dir=download_dir, track_list=scraper.scrape_album_tracks(track.album.spotify_id), recursive=False)
|
||||||
for new_track in new_tracks:
|
|
||||||
if new_track not in track_list and len(track_list) < recursive_limit:
|
|
||||||
track_list.append(new_track)
|
|
||||||
console.log(f'Thread<{my_thread_id}> | Scraped {len(new_tracks)} new songs through recursive album!')
|
|
||||||
|
|
||||||
for artist in track.artists:
|
for artist in track.artists:
|
||||||
if artist.spotify_id not in g_downloaded_artist_covers:
|
if artist.spotify_id not in g_downloaded_artist_covers:
|
||||||
|
@ -83,19 +81,16 @@ def download_track_list(download_dir: str, track_list: list, recursive_artist: b
|
||||||
console.error(str(ex))
|
console.error(str(ex))
|
||||||
g_downloaded_artist_covers.append(artist.spotify_id)
|
g_downloaded_artist_covers.append(artist.spotify_id)
|
||||||
|
|
||||||
if (recursive_artist or recursive) and len(track_list) < recursive_limit:
|
if (recursive_artist or recursive):
|
||||||
old_size = len(track_list)
|
download_track_list(download_dir=download_dir, track_list=scraper.scrape_artist_tracks(track.artist.spotify_id), recursive=False)
|
||||||
track_list += list(scraper.scrape_artist_tracks(artist.spotify_id))
|
|
||||||
if recursive_artist:
|
if recursive_artist:
|
||||||
albums = list(scraper.scrape_artist_albums(artist.spotify_id))
|
for album in scraper.scrape_artist_albums(artist.spotify_id):
|
||||||
for album in albums:
|
download_track_list(download_dir=download_dir, track_list=scraper.scrape_album_tracks(album['id']), recursive=False)
|
||||||
track_list += list(scraper.scrape_album_tracks(album['id']))
|
|
||||||
console.log(f'Thread<{my_thread_id}> | Scraped {len(track_list) - old_size} new songs through recursive artist!')
|
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
console.error(f'Thread<{my_thread_id}> | Exception: {ex}')
|
console.error(f'Thread<{my_thread_id}> | Exception: {ex}')
|
||||||
downloaded_count += 1
|
downloaded_count += 1
|
||||||
if settings.VERBOSE_OUTPUTS:
|
if settings.VERBOSE_OUTPUTS:
|
||||||
console.log(f'Thread<{my_thread_id}> | Processed {downloaded_count} / {len(track_list)}')
|
console.log(f'Thread<{my_thread_id}> | Processed {downloaded_count} tracks')
|
||||||
|
|
||||||
|
|
||||||
def save_globals_save_file():
|
def save_globals_save_file():
|
||||||
|
@ -133,18 +128,7 @@ def full_download(download_dir: str, identifier: str, recursive_artist: bool=Fal
|
||||||
|
|
||||||
client.refresh_tokens()
|
client.refresh_tokens()
|
||||||
console.log(f'Recieved scrape command on identifier: {identifier}, {recursive=}, {recursive_artist=}, {recursive_album=}, {recursive_limit=}, {thread_count=}')
|
console.log(f'Recieved scrape command on identifier: {identifier}, {recursive=}, {recursive_artist=}, {recursive_album=}, {recursive_limit=}, {thread_count=}')
|
||||||
download_threads = []
|
download_track_list(download_dir=download_dir, track_list=scraper.scrape_tracks(identifier, console=console), recursive=recursive, recursive_album=recursive_album, recursive_artist=recursive_artist, recursive_limit=recursive_limit)
|
||||||
track_list = []
|
|
||||||
for track in scraper.scrape_tracks(identifier, console=console):
|
|
||||||
track_list.append(track)
|
|
||||||
if len(track_list) == recursive_limit / thread_count:
|
|
||||||
download_threads.append(Thread(target=download_track_list, args=(download_dir, list(track_list), recursive_artist, recursive_album, recursive, recursive_limit)))
|
|
||||||
download_threads[-1].start()
|
|
||||||
sleep(0.05)
|
|
||||||
download_threads.append(Thread(target=download_track_list, args=(download_dir, list(track_list), recursive_artist, recursive_album, recursive, recursive_limit)))
|
|
||||||
download_threads[-1].start()
|
|
||||||
|
|
||||||
[x.join() for x in download_threads]
|
|
||||||
|
|
||||||
console.log(f'Comletely done scraping identifier: {identifier}!')
|
console.log(f'Comletely done scraping identifier: {identifier}!')
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
from typing import Generator
|
||||||
from config import *
|
from config import *
|
||||||
from utils.spotify_track import SpotifyTrack
|
from utils.spotify_track import SpotifyTrack
|
||||||
|
from utils.spotify_album import SpotifyAlbum
|
||||||
from utils.spotify_playlist import SpotifyPlaylist
|
from utils.spotify_playlist import SpotifyPlaylist
|
||||||
from utils.spotify_category import SpotifyCategory
|
from utils.spotify_category import SpotifyCategory
|
||||||
from spotify_client import SpotifyClient
|
from spotify_client import SpotifyClient
|
||||||
|
@ -39,7 +41,7 @@ class SpotifyScraper:
|
||||||
def extract_id_from_link(self, link: str) -> str:
|
def extract_id_from_link(self, link: str) -> str:
|
||||||
return link[link.rindex('/') + 1:]
|
return link[link.rindex('/') + 1:]
|
||||||
|
|
||||||
def scrape_tracks(self, link: str, console=None) -> list[SpotifyTrack]:
|
def scrape_tracks(self, link: str, console=None) -> Generator[SpotifyTrack, None, None]:
|
||||||
id_type = self.identify_link_type(link)
|
id_type = self.identify_link_type(link)
|
||||||
if id_type == self.IDTypes.Playlist:
|
if id_type == self.IDTypes.Playlist:
|
||||||
return self.scrape_playlist_tracks(self.extract_id_from_link(link))
|
return self.scrape_playlist_tracks(self.extract_id_from_link(link))
|
||||||
|
@ -80,7 +82,7 @@ class SpotifyScraper:
|
||||||
def scrape_album(self, album_id: str):
|
def scrape_album(self, album_id: str):
|
||||||
return self._client.get(f'https://api.spotify.com/v1/albums/{album_id}').json()
|
return self._client.get(f'https://api.spotify.com/v1/albums/{album_id}').json()
|
||||||
|
|
||||||
def scrape_album_tracks(self, album_id: str):
|
def scrape_album_tracks(self, album_id: str) -> Generator[SpotifyTrack, None, None]:
|
||||||
limit = 50
|
limit = 50
|
||||||
offset = 0
|
offset = 0
|
||||||
ret = self._client.get(f'https://api.spotify.com/v1/albums/{album_id}/tracks?limit={limit}').json()
|
ret = self._client.get(f'https://api.spotify.com/v1/albums/{album_id}/tracks?limit={limit}').json()
|
||||||
|
@ -95,34 +97,30 @@ class SpotifyScraper:
|
||||||
def scrape_artist(self, artist_id: str):
|
def scrape_artist(self, artist_id: str):
|
||||||
return self.get(f'https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market=from_token').json()
|
return self.get(f'https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market=from_token').json()
|
||||||
|
|
||||||
def scrape_artist_albums(self, artist_id: str):
|
def scrape_artist_albums(self, artist_id: str) -> Generator[SpotifyAlbum, None, None]:
|
||||||
offset = 0
|
offset = 0
|
||||||
limit = 50
|
limit = 50
|
||||||
albums_data = self.get(f'https://api.spotify.com/v1/artists/{artist_id}/albums?market=from_token&limit={limit}&offset={offset}').json()
|
albums_data = self.get(f'https://api.spotify.com/v1/artists/{artist_id}/albums?market=from_token&limit={limit}&offset={offset}').json()
|
||||||
albums = albums_data['items']
|
for album in albums_data['items']:
|
||||||
|
yield SpotifyAlbum(album)
|
||||||
while albums_data['next'] is not None:
|
while albums_data['next'] is not None:
|
||||||
offset += limit
|
offset += limit
|
||||||
albums_data = self.get(f'https://api.spotify.com/v1/artists/{artist_id}/albums?market=from_token&limit={limit}&offset={offset}').json()
|
albums_data = self.get(f'https://api.spotify.com/v1/artists/{artist_id}/albums?market=from_token&limit={limit}&offset={offset}').json()
|
||||||
albums += albums_data['items']
|
for album in albums_data['items']:
|
||||||
return albums
|
yield SpotifyAlbum(album)
|
||||||
|
|
||||||
def scrape_artist_tracks(self, artist_id: str, intense:bool=False, console=None):
|
def scrape_artist_tracks(self, artist_id: str, intense:bool=False, console=None) -> Generator[SpotifyTrack, None, None]:
|
||||||
tracks = self.scrape_artist(artist_id)['tracks']
|
tracks = self.scrape_artist(artist_id)['tracks']
|
||||||
try:
|
try:
|
||||||
artist_name = tracks[0]['album']['artists'][0]['name']
|
artist_name = tracks[0]['album']['artists'][0]['name']
|
||||||
except:
|
except:
|
||||||
artist_name = 'Unknown'
|
artist_name = 'Unknown'
|
||||||
proccessed_tracks = [SpotifyTrack(track_data) for track_data in tracks]
|
for track_data in tracks:
|
||||||
yield proccessed_tracks
|
yield SpotifyTrack(track_data)
|
||||||
if intense:
|
if intense:
|
||||||
albums = self.scrape_artist_albums(artist_id)
|
for album in self.scrape_artist_albums(artist_id):
|
||||||
proccessed_album_count = 0
|
for track in self.scrape_album_tracks(album.spotify_id):
|
||||||
for album in albums:
|
|
||||||
for track in self.scrape_album_tracks(album['id']):
|
|
||||||
yield track
|
yield track
|
||||||
proccessed_album_count += 1
|
|
||||||
if console is not None:
|
|
||||||
console.log(f'Scraping {artist_name}\'s albums: {proccessed_album_count} / {len(albums)}')
|
|
||||||
|
|
||||||
def get(self, url: str) -> Response:
|
def get(self, url: str) -> Response:
|
||||||
return self._client.get(url)
|
return self._client.get(url)
|
||||||
|
@ -180,7 +178,7 @@ class SpotifyScraper:
|
||||||
tracks = self.scrape_playlist_tracks(playlist_id)
|
tracks = self.scrape_playlist_tracks(playlist_id)
|
||||||
return SpotifyPlaylist(spotify_id=playlist_id, tracks=tracks, data=playlist_data)
|
return SpotifyPlaylist(spotify_id=playlist_id, tracks=tracks, data=playlist_data)
|
||||||
|
|
||||||
def scrape_user_items(self, user_id: str, limit:int=50) -> list[SpotifyTrack]:
|
def scrape_user_items(self, user_id: str, limit:int=50) -> Generator[SpotifyTrack, None, None]:
|
||||||
has_next = True
|
has_next = True
|
||||||
user_playlists = []
|
user_playlists = []
|
||||||
while has_next:
|
while has_next:
|
||||||
|
|
Ładowanie…
Reference in New Issue