diff --git a/api/config/settings/common.py b/api/config/settings/common.py index 675d3e8ce..9ba1ecd09 100644 --- a/api/config/settings/common.py +++ b/api/config/settings/common.py @@ -1309,3 +1309,6 @@ IGNORE_FORWARDED_HOST_AND_PROTO = env.bool( """ Use :attr:`FUNKWHALE_HOSTNAME` and :attr:`FUNKWHALE_PROTOCOL ` instead of request header. """ + +HASHING_ALGORITHM = "sha256" +HASHING_CHUNK_SIZE = 1024 * 100 diff --git a/api/funkwhale_api/common/utils.py b/api/funkwhale_api/common/utils.py index 0a38ef05f..2a91524d5 100644 --- a/api/funkwhale_api/common/utils.py +++ b/api/funkwhale_api/common/utils.py @@ -1,4 +1,5 @@ import datetime +import hashlib from django.core.files.base import ContentFile from django.http import request @@ -458,3 +459,19 @@ def monkey_patch_request_build_absolute_uri(): request.HttpRequest.scheme = property(scheme) request.HttpRequest.get_host = get_host + + +def get_file_hash(file, algo=None, chunk_size=None, full_read=False): + algo = algo or settings.HASHING_ALGORITHM + chunk_size = chunk_size or settings.HASHING_CHUNK_SIZE + handler = getattr(hashlib, algo) + hash = handler() + file.seek(0) + if full_read: + for byte_block in iter(lambda: file.read(chunk_size), b""): + hash.update(byte_block) + else: + # sometimes, it's useful to only hash the beginning of the file, e.g. + # to avoid a lot of I/O when crawling large libraries + hash.update(file.read(chunk_size)) + return "{}:{}".format(algo, hash.hexdigest()) diff --git a/api/funkwhale_api/music/management/commands/fix_uploads.py b/api/funkwhale_api/music/management/commands/fix_uploads.py index 582a837c4..02f3d2232 100644 --- a/api/funkwhale_api/music/management/commands/fix_uploads.py +++ b/api/funkwhale_api/music/management/commands/fix_uploads.py @@ -2,6 +2,7 @@ from django.core.management.base import BaseCommand from django.db import transaction from django.db.models import Q +from funkwhale_api.common import utils as common_utils from funkwhale_api.music import models, utils @@ -17,9 +18,9 @@ class Command(BaseCommand): help="Do not execute anything", ) parser.add_argument( - "--mimetypes", + "--mimetype", action="store_true", - dest="mimetypes", + dest="mimetype", default=True, help="Check and fix mimetypes", ) @@ -37,16 +38,33 @@ class Command(BaseCommand): default=False, help="Check and fix file size, can be really slow because it needs to access files", ) + parser.add_argument( + "--checksum", + action="store_true", + dest="checksum", + default=False, + help="Check and fix file checksums, can be really slow because it needs to access files", + ) + parser.add_argument( + "--batch-size", + "-s", + dest="batch_size", + default=1000, + type=int, + help="Size of each updated batch", + ) def handle(self, *args, **options): if options["dry_run"]: self.stdout.write("Dry-run on, will not commit anything") - if options["mimetypes"]: + if options["mimetype"]: self.fix_mimetypes(**options) if options["data"]: self.fix_file_data(**options) if options["size"]: self.fix_file_size(**options) + if options["checksum"]: + self.fix_file_checksum(**options) @transaction.atomic def fix_mimetypes(self, dry_run, **kwargs): @@ -54,11 +72,12 @@ class Command(BaseCommand): matching = models.Upload.objects.filter( Q(source__startswith="file://") | Q(source__startswith="upload://") ).exclude(mimetype__startswith="audio/") + total = matching.count() self.stdout.write( - "[mimetypes] {} entries found with bad or no mimetype".format( - matching.count() - ) + "[mimetypes] {} entries found with bad or no mimetype".format(total) ) + if not 
total: + return for extension, mimetype in utils.EXTENSION_TO_MIMETYPE.items(): qs = matching.filter(source__endswith=".{}".format(extension)) self.stdout.write( @@ -81,24 +100,36 @@ class Command(BaseCommand): ) if dry_run: return - for i, upload in enumerate(matching.only("audio_file")): - self.stdout.write( - "[bitrate/length] {}/{} fixing file #{}".format(i + 1, total, upload.pk) - ) - try: - audio_file = upload.get_audio_file() - if audio_file: + chunks = common_utils.chunk_queryset( + matching.only("id", "audio_file", "source"), kwargs["batch_size"] + ) + handled = 0 + for chunk in chunks: + updated = [] + for upload in chunk: + handled += 1 + self.stdout.write( + "[bitrate/length] {}/{} fixing file #{}".format( + handled, total, upload.pk + ) + ) + + try: + audio_file = upload.get_audio_file() data = utils.get_audio_file_data(audio_file) upload.bitrate = data["bitrate"] upload.duration = data["length"] - upload.save(update_fields=["duration", "bitrate"]) + except Exception as e: + self.stderr.write( + "[bitrate/length] error with file #{}: {}".format( + upload.pk, str(e) + ) + ) else: - self.stderr.write("[bitrate/length] no file found") - except Exception as e: - self.stderr.write( - "[bitrate/length] error with file #{}: {}".format(upload.pk, str(e)) - ) + updated.append(upload) + + models.Upload.objects.bulk_update(updated, ["bitrate", "duration"]) def fix_file_size(self, dry_run, **kwargs): self.stdout.write("Fixing missing size...") @@ -107,15 +138,64 @@ class Command(BaseCommand): self.stdout.write("[size] {} entries found with missing values".format(total)) if dry_run: return - for i, upload in enumerate(matching.only("size")): - self.stdout.write( - "[size] {}/{} fixing file #{}".format(i + 1, total, upload.pk) - ) - try: - upload.size = upload.get_file_size() - upload.save(update_fields=["size"]) - except Exception as e: - self.stderr.write( - "[size] error with file #{}: {}".format(upload.pk, str(e)) + chunks = common_utils.chunk_queryset( + matching.only("id", "audio_file", "source"), kwargs["batch_size"] + ) + handled = 0 + for chunk in chunks: + updated = [] + for upload in chunk: + handled += 1 + + self.stdout.write( + "[size] {}/{} fixing file #{}".format(handled, total, upload.pk) ) + + try: + upload.size = upload.get_file_size() + except Exception as e: + self.stderr.write( + "[size] error with file #{}: {}".format(upload.pk, str(e)) + ) + else: + updated.append(upload) + + models.Upload.objects.bulk_update(updated, ["size"]) + + def fix_file_checksum(self, dry_run, **kwargs): + self.stdout.write("Fixing missing checksums...") + matching = models.Upload.objects.filter( + Q(checksum=None) + & (Q(audio_file__isnull=False) | Q(source__startswith="file://")) + ) + total = matching.count() + self.stdout.write( + "[checksum] {} entries found with missing values".format(total) + ) + if dry_run: + return + chunks = common_utils.chunk_queryset( + matching.only("id", "audio_file", "source"), kwargs["batch_size"] + ) + handled = 0 + for chunk in chunks: + updated = [] + for upload in chunk: + handled += 1 + self.stdout.write( + "[checksum] {}/{} fixing file #{}".format(handled, total, upload.pk) + ) + + try: + upload.checksum = common_utils.get_file_hash( + upload.get_audio_file() + ) + except Exception as e: + self.stderr.write( + "[checksum] error with file #{}: {}".format(upload.pk, str(e)) + ) + else: + updated.append(upload) + + models.Upload.objects.bulk_update(updated, ["checksum"]) diff --git a/api/funkwhale_api/music/management/commands/import_files.py 
b/api/funkwhale_api/music/management/commands/import_files.py index cbffca63d..fab980510 100644 --- a/api/funkwhale_api/music/management/commands/import_files.py +++ b/api/funkwhale_api/music/management/commands/import_files.py @@ -1,17 +1,29 @@ +import collections +import datetime import itertools import os -import urllib.parse +import queue +import threading import time +import urllib.parse + +import watchdog.events +import watchdog.observers from django.conf import settings from django.core.files import File +from django.core.management import call_command from django.core.management.base import BaseCommand, CommandError +from django.db.models import Q from django.utils import timezone +from rest_framework import serializers + +from funkwhale_api.common import utils as common_utils from funkwhale_api.music import models, tasks, utils -def crawl_dir(dir, extensions, recursive=True): +def crawl_dir(dir, extensions, recursive=True, ignored=[]): if os.path.isfile(dir): yield dir return @@ -20,9 +32,12 @@ def crawl_dir(dir, extensions, recursive=True): if entry.is_file(): for e in extensions: if entry.name.lower().endswith(".{}".format(e.lower())): - yield entry.path + if entry.path not in ignored: + yield entry.path elif recursive and entry.is_dir(): - yield from crawl_dir(entry, extensions, recursive=recursive) + yield from crawl_dir( + entry, extensions, recursive=recursive, ignored=ignored + ) def batch(iterable, n=1): @@ -116,6 +131,17 @@ class Command(BaseCommand): "of overhead on your server and on servers you are federating with." ), ) + parser.add_argument( + "--watch", + action="store_true", + dest="watch", + default=False, + help=( + "Start the command in watch mode. Instead of running a full import " + "and exiting, watch the given path and import new files, remove deleted " + "files, and update metadata corresponding to updated files." + ), + ) parser.add_argument("-e", "--extension", nargs="+") parser.add_argument( @@ -128,6 +154,15 @@ class Command(BaseCommand): "This causes some overhead, so it's disabled by default." ), ) + parser.add_argument( + "--prune", + action="store_true", + dest="prune", + default=False, + help=( + "Once the import is completed, prune tracks, albums and artists that aren't linked to any upload." + ), + ) parser.add_argument( "--reference", @@ -157,6 +192,8 @@ class Command(BaseCommand): ) def handle(self, *args, **options): + # handle relative directories + options["path"] = [os.path.abspath(path) for path in options["path"]] self.is_confirmed = False try: library = models.Library.objects.select_related("actor__user").get( @@ -182,22 +219,12 @@ class Command(BaseCommand): ) if p and not import_path.startswith(p): raise CommandError( - "Importing in-place only works if importing" + "Importing in-place only works if importing " "from {} (MUSIC_DIRECTORY_PATH), as this directory" "needs to be accessible by the webserver."
"Culprit: {}".format(p, import_path) ) - extensions = options.get("extension") or utils.SUPPORTED_EXTENSIONS - crawler = itertools.chain( - *[ - crawl_dir(p, extensions=extensions, recursive=options["recursive"]) - for p in options["path"] - ] - ) - errors = [] - total = 0 - start_time = time.time() reference = options["reference"] or "cli-{}".format(timezone.now().isoformat()) import_url = "{}://{}/library/{}/upload?{}" @@ -212,8 +239,62 @@ class Command(BaseCommand): reference, import_url ) ) + extensions = options.get("extension") or utils.SUPPORTED_EXTENSIONS + if options["watch"]: + if len(options["path"]) > 1: + raise CommandError("Watch only work with a single directory") + + return self.setup_watcher( + extensions=extensions, + path=options["path"][0], + reference=reference, + library=library, + in_place=options["in_place"], + prune=options["prune"], + recursive=options["recursive"], + replace=options["replace"], + dispatch_outbox=options["outbox"], + broadcast=options["broadcast"], + ) + + update = True + checked_paths = set() + if options["in_place"] and update: + self.stdout.write("Checking existing files for updates…") + message = ( + "Are you sure you want to do this?\n\n" + "Type 'yes' to continue, or 'no' to skip checking for updates in " + "already imported files: " + ) + if options["interactive"] and input("".join(message)) != "yes": + pass + else: + checked_paths = check_updates( + stdout=self.stdout, + paths=options["path"], + extensions=extensions, + library=library, + batch_size=options["batch_size"], + ) + self.stdout.write("Existing files checked, moving on to next step!") + + crawler = itertools.chain( + *[ + crawl_dir( + p, + extensions=extensions, + recursive=options["recursive"], + ignored=checked_paths, + ) + for p in options["path"] + ] + ) + errors = [] + total = 0 + start_time = time.time() batch_start = None batch_duration = None + self.stdout.write("Starting import of new files…") for i, entries in enumerate(batch(crawler, options["batch_size"])): total += len(entries) batch_start = time.time() @@ -225,7 +306,7 @@ class Command(BaseCommand): if entries: self.stdout.write( "Handling batch {} ({} items){}".format( - i + 1, options["batch_size"], time_stats, + i + 1, len(entries), time_stats, ) ) batch_errors = self.handle_batch( @@ -240,9 +321,9 @@ class Command(BaseCommand): batch_duration = time.time() - batch_start - message = "Successfully imported {} tracks in {}s" + message = "Successfully imported {} new tracks in {}s" if options["async_"]: - message = "Successfully launched import for {} tracks in {}s" + message = "Successfully launched import for {} new tracks in {}s" self.stdout.write( message.format(total - len(errors), int(time.time() - start_time)) @@ -259,6 +340,12 @@ class Command(BaseCommand): ) ) + if options["prune"]: + self.stdout.write( + "Pruning dangling tracks, albums and artists from library…" + ) + prune() + def handle_batch(self, library, paths, batch, reference, options): matching = [] for m in paths: @@ -362,15 +449,15 @@ class Command(BaseCommand): message.format(batch=batch, path=path, i=i + 1, total=len(paths)) ) try: - self.create_upload( - path, - reference, - library, - async_, - options["replace"], - options["in_place"], - options["outbox"], - options["broadcast"], + create_upload( + path=path, + reference=reference, + library=library, + async_=async_, + replace=options["replace"], + in_place=options["in_place"], + dispatch_outbox=options["outbox"], + broadcast=options["broadcast"], ) except Exception as e: if 
options["exit_on_failure"]: @@ -382,34 +469,311 @@ class Command(BaseCommand): errors.append((path, "{} {}".format(e.__class__.__name__, e))) return errors - def create_upload( - self, - path, - reference, - library, - async_, - replace, - in_place, - dispatch_outbox, - broadcast, - ): - import_handler = tasks.process_upload.delay if async_ else tasks.process_upload - upload = models.Upload(library=library, import_reference=reference) - upload.source = "file://" + path - upload.import_metadata = { - "funkwhale": { - "config": { - "replace": replace, - "dispatch_outbox": dispatch_outbox, - "broadcast": broadcast, - } + def setup_watcher(self, path, extensions, recursive, **kwargs): + watchdog_queue = queue.Queue() + # Set up a worker thread to process database load + worker = threading.Thread( + target=process_load_queue(self.stdout, **kwargs), args=(watchdog_queue,), + ) + worker.setDaemon(True) + worker.start() + + # setup watchdog to monitor directory for trigger files + patterns = ["*.{}".format(e) for e in extensions] + event_handler = Watcher( + stdout=self.stdout, queue=watchdog_queue, patterns=patterns, + ) + observer = watchdog.observers.Observer() + observer.schedule(event_handler, path, recursive=recursive) + observer.start() + + try: + while True: + self.stdout.write( + "Watching for changes at {}…".format(path), ending="\r" + ) + time.sleep(10) + if kwargs["prune"] and GLOBAL["need_pruning"]: + self.stdout.write("Some files were deleted, pruning library…") + prune() + GLOBAL["need_pruning"] = False + except KeyboardInterrupt: + self.stdout.write("Exiting…") + observer.stop() + + observer.join() + + +GLOBAL = {"need_pruning": False} + + +def prune(): + call_command( + "prune_library", + dry_run=False, + prune_artists=True, + prune_albums=True, + prune_tracks=True, + ) + + +def create_upload( + path, reference, library, async_, replace, in_place, dispatch_outbox, broadcast, +): + import_handler = tasks.process_upload.delay if async_ else tasks.process_upload + upload = models.Upload(library=library, import_reference=reference) + upload.source = "file://" + path + upload.import_metadata = { + "funkwhale": { + "config": { + "replace": replace, + "dispatch_outbox": dispatch_outbox, + "broadcast": broadcast, } } - if not in_place: - name = os.path.basename(path) - with open(path, "rb") as f: - upload.audio_file.save(name, File(f), save=False) + } + if not in_place: + name = os.path.basename(path) + with open(path, "rb") as f: + upload.audio_file.save(name, File(f), save=False) - upload.save() + upload.save() - import_handler(upload_id=upload.pk) + import_handler(upload_id=upload.pk) + + +def process_load_queue(stdout, **kwargs): + def inner(q): + # we batch events, to avoid calling same methods multiple times if a file is modified + # a lot in a really short time + flush_delay = 2 + batched_events = collections.OrderedDict() + while True: + while True: + if not q.empty(): + event = q.get() + batched_events[event["path"]] = event + else: + break + for path, event in batched_events.copy().items(): + if time.time() - event["time"] <= flush_delay: + continue + now = datetime.datetime.utcnow() + stdout.write( + "{} -- Processing {}:{}...\n".format( + now.strftime("%Y/%m/%d %H:%M:%S"), event["type"], event["path"] + ) + ) + del batched_events[path] + handle_event(event, stdout=stdout, **kwargs) + time.sleep(1) + + return inner + + +class Watcher(watchdog.events.PatternMatchingEventHandler): + def __init__(self, stdout, queue, patterns): + self.stdout = stdout + self.queue = queue + 
super().__init__(patterns=patterns) + + def enqueue(self, event): + e = { + "is_directory": event.is_directory, + "type": event.event_type, + "path": event.src_path, + "src_path": event.src_path, + "dest_path": getattr(event, "dest_path", None), + "time": time.time(), + } + self.queue.put(e) + + def on_moved(self, event): + self.enqueue(event) + + def on_created(self, event): + self.enqueue(event) + + def on_deleted(self, event): + self.enqueue(event) + + def on_modified(self, event): + self.enqueue(event) + + +def handle_event(event, stdout, **kwargs): + handlers = { + "modified": handle_modified, + "created": handle_created, + "moved": handle_moved, + "deleted": handle_deleted, + } + handlers[event["type"]](event=event, stdout=stdout, **kwargs) + + +def handle_modified(event, stdout, library, in_place, **kwargs): + existing_candidates = library.uploads.filter(import_status="finished") + with open(event["path"], "rb") as f: + checksum = common_utils.get_file_hash(f) + + existing = existing_candidates.filter(checksum=checksum).first() + if existing: + # found an existing file with same checksum, nothing to do + stdout.write(" File already imported and metadata is up-to-date") + return + + to_update = None + if in_place: + source = "file://{}".format(event["path"]) + to_update = ( + existing_candidates.in_place() + .filter(source=source) + .select_related( + "track__attributed_to", "track__artist", "track__album__artist", + ) + .first() + ) + if to_update: + if ( + to_update.track.attributed_to + and to_update.track.attributed_to != library.actor + ): + stdout.write( + " Cannot update track metadata of file #{}, track belongs to someone else".format( + to_update.pk + ) + ) + return + else: + stdout.write( + " Updating existing file #{} with new metadata…".format( + to_update.pk + ) + ) + audio_metadata = to_update.get_metadata() + try: + tasks.update_track_metadata(audio_metadata, to_update.track) + except serializers.ValidationError as e: + stdout.write(" Invalid metadata: {}".format(e)) + else: + to_update.checksum = checksum + to_update.save(update_fields=["checksum"]) + return + + stdout.write(" Launching import for new file") + create_upload( + path=event["path"], + reference=kwargs["reference"], + library=library, + async_=False, + replace=kwargs["replace"], + in_place=in_place, + dispatch_outbox=kwargs["dispatch_outbox"], + broadcast=kwargs["broadcast"], + ) + + +def handle_created(event, stdout, **kwargs): + """ + Created is essentially an alias for modified, because for instance when copying a file in the watched directory, + a created event will be fired on the initial touch, followed by many modified events (as the file is written). 
+ """ + return handle_modified(event, stdout, **kwargs) + + +def handle_moved(event, stdout, library, in_place, **kwargs): + if not in_place: + return + + old_source = "file://{}".format(event["src_path"]) + new_source = "file://{}".format(event["dest_path"]) + existing_candidates = library.uploads.filter(import_status="finished") + existing_candidates = existing_candidates.in_place().filter(source=old_source) + existing = existing_candidates.first() + if existing: + stdout.write(" Updating path of existing file #{}".format(existing.pk)) + existing.source = new_source + existing.save(update_fields=["source"]) + + +def handle_deleted(event, stdout, library, in_place, **kwargs): + if not in_place: + return + source = "file://{}".format(event["path"]) + existing_candidates = library.uploads.filter(import_status="finished") + existing_candidates = existing_candidates.in_place().filter(source=source) + if existing_candidates.count(): + stdout.write(" Removing file from DB") + existing_candidates.delete() + GLOBAL["need_pruning"] = True + + +def check_updates(stdout, library, extensions, paths, batch_size): + existing = ( + library.uploads.in_place() + .filter(import_status="finished") + .exclude(checksum=None) + .select_related("library", "track") + ) + queries = [] + checked_paths = set() + for path in paths: + for ext in extensions: + queries.append( + Q(source__startswith="file://{}".format(path)) + & Q(source__endswith=".{}".format(ext)) + ) + query, remainder = queries[0], queries[1:] + for q in remainder: + query = q | query + existing = existing.filter(query) + total = existing.count() + stdout.write("Found {} files to check in database!".format(total)) + uploads = existing.order_by("source") + for i, rows in enumerate(batch(uploads.iterator(), batch_size)): + stdout.write("Handling batch {} ({} items)".format(i + 1, len(rows),)) + + for upload in rows: + + check_upload(stdout, upload) + checked_paths.add(upload.source.replace("file://", "", 1)) + + return checked_paths + + +def check_upload(stdout, upload): + try: + audio_file = upload.get_audio_file() + except FileNotFoundError: + stdout.write( + " Removing file #{} missing from disk at {}".format( + upload.pk, upload.source + ) + ) + return upload.delete() + + checksum = common_utils.get_file_hash(audio_file) + if upload.checksum != checksum: + stdout.write( + " File #{} at {} was modified, updating metadata…".format( + upload.pk, upload.source + ) + ) + if upload.library.actor_id != upload.track.attributed_to_id: + stdout.write( + " Cannot update track metadata, track belongs to someone else".format( + upload.pk + ) + ) + else: + track = models.Track.objects.select_related("artist", "album__artist").get( + pk=upload.track_id + ) + try: + tasks.update_track_metadata(upload.get_metadata(), track) + except serializers.ValidationError as e: + stdout.write(" Invalid metadata: {}".format(e)) + return + else: + upload.checksum = checksum + upload.save(update_fields=["checksum"]) diff --git a/api/funkwhale_api/music/migrations/0052_auto_20200505_0810.py b/api/funkwhale_api/music/migrations/0052_auto_20200505_0810.py new file mode 100644 index 000000000..2a0526643 --- /dev/null +++ b/api/funkwhale_api/music/migrations/0052_auto_20200505_0810.py @@ -0,0 +1,23 @@ +# Generated by Django 3.0.4 on 2020-05-05 08:10 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('music', '0051_auto_20200319_1249'), + ] + + operations = [ + migrations.AddField( + model_name='upload', + name='checksum', 
+ field=models.CharField(blank=True, db_index=True, max_length=100, null=True), + ), + migrations.AlterField( + model_name='uploadversion', + name='mimetype', + field=models.CharField(choices=[('audio/mp3', 'mp3'), ('audio/mpeg3', 'mp3'), ('audio/x-mp3', 'mp3'), ('audio/mpeg', 'mp3'), ('video/ogg', 'ogg'), ('audio/ogg', 'ogg'), ('audio/opus', 'opus'), ('audio/x-m4a', 'aac'), ('audio/x-m4a', 'm4a'), ('audio/x-flac', 'flac'), ('audio/flac', 'flac')], max_length=50), + ), + ] diff --git a/api/funkwhale_api/music/models.py b/api/funkwhale_api/music/models.py index ecb616b41..43371d5d8 100644 --- a/api/funkwhale_api/music/models.py +++ b/api/funkwhale_api/music/models.py @@ -655,6 +655,14 @@ class Track(APIModelMixin): class UploadQuerySet(common_models.NullsLastQuerySet): + def in_place(self, include=True): + query = models.Q(source__startswith="file://") & ( + models.Q(audio_file="") | models.Q(audio_file=None) + ) + if not include: + query = ~query + return self.filter(query) + def playable_by(self, actor, include=True): libraries = Library.objects.viewable_by(actor) @@ -754,6 +762,9 @@ class Upload(models.Model): ) downloads_count = models.PositiveIntegerField(default=0) + # stores checksums such as `sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855` + checksum = models.CharField(max_length=100, db_index=True, null=True, blank=True) + objects = UploadQuerySet.as_manager() @property @@ -833,7 +844,7 @@ class Upload(models.Model): def get_audio_file(self): if self.audio_file: return self.audio_file.open() - if self.source.startswith("file://"): + if self.source and self.source.startswith("file://"): return open(self.source.replace("file://", "", 1), "rb") def get_audio_data(self): @@ -866,6 +877,15 @@ class Upload(models.Model): self.mimetype = mimetypes.guess_type(self.source)[0] if not self.size and self.audio_file: self.size = self.audio_file.size + if not self.checksum: + try: + audio_file = self.get_audio_file() + except FileNotFoundError: + pass + else: + if audio_file: + self.checksum = common_utils.get_file_hash(audio_file) + if not self.pk and not self.fid and self.library.actor.get_user(): self.fid = self.get_federation_id() return super().save(**kwargs) diff --git a/api/funkwhale_api/music/tasks.py b/api/funkwhale_api/music/tasks.py index 89ee1a88f..722a9eefb 100644 --- a/api/funkwhale_api/music/tasks.py +++ b/api/funkwhale_api/music/tasks.py @@ -851,3 +851,71 @@ def update_library_entity(obj, data): obj.save(update_fields=list(data.keys())) return obj + + +UPDATE_CONFIG = { + "track": { + "position": {}, + "title": {}, + "mbid": {}, + "disc_number": {}, + "copyright": {}, + "license": { + "getter": lambda data, field: licenses.match( + data.get("license"), data.get("copyright") + ) + }, + }, + "album": {"title": {}, "mbid": {}, "release_date": {}}, + "artist": {"name": {}, "mbid": {}}, + "album_artist": {"name": {}, "mbid": {}}, +} + + +@transaction.atomic +def update_track_metadata(audio_metadata, track): + # used to refresh metadata when an imported file + # is updated by an outside tool (e.g. Beets). 
+ serializer = metadata.TrackMetadataSerializer(data=audio_metadata) + serializer.is_valid(raise_exception=True) + new_data = serializer.validated_data + + to_update = [ + ("track", track, lambda data: data), + ("album", track.album, lambda data: data["album"]), + ("artist", track.artist, lambda data: data["artists"][0]), + ( + "album_artist", + track.album.artist if track.album else None, + lambda data: data["album"]["artists"][0], + ), + ] + for id, obj, data_getter in to_update: + if not obj: + continue + obj_updated_fields = [] + try: + obj_data = data_getter(new_data) + except IndexError: + continue + for field, config in UPDATE_CONFIG[id].items(): + getter = config.get( + "getter", lambda data, field: data[config.get("field", field)] + ) + try: + new_value = getter(obj_data, field) + except KeyError: + continue + old_value = getattr(obj, field) + if new_value == old_value: + continue + obj_updated_fields.append(field) + setattr(obj, field, new_value) + + if obj_updated_fields: + obj.save(update_fields=obj_updated_fields) + + if track.album and "album" in new_data and new_data["album"].get("cover_data"): + common_utils.attach_file( + track.album, "attachment_cover", new_data["album"].get("cover_data") + ) diff --git a/api/requirements/base.txt b/api/requirements/base.txt index ecb3d9b5e..6e31f857e 100644 --- a/api/requirements/base.txt +++ b/api/requirements/base.txt @@ -83,3 +83,4 @@ service_identity==18.1.0 markdown>=3.2,<4 bleach>=3,<4 feedparser==6.0.0b3 +watchdog==0.10.2 diff --git a/api/tests/common/test_utils.py b/api/tests/common/test_utils.py index ffe9eae3b..303c8cbca 100644 --- a/api/tests/common/test_utils.py +++ b/api/tests/common/test_utils.py @@ -258,3 +258,12 @@ def test_monkey_patch_request_build_absolute_uri( request = fake_request.get("/", **meta) assert request.build_absolute_uri(path) == expected + + +def test_get_file_hash(tmpfile, settings): + settings.HASHING_ALGORITHM = "sha256" + content = b"hello" + tmpfile.write(content) + # echo -n "hello" | sha256sum + expected = "sha256:2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824" + assert utils.get_file_hash(tmpfile) == expected diff --git a/api/tests/music/test_commands.py b/api/tests/music/test_commands.py index a08f1b10b..329dcffb5 100644 --- a/api/tests/music/test_commands.py +++ b/api/tests/music/test_commands.py @@ -1,6 +1,7 @@ import os import pytest +from funkwhale_api.common import utils as common_utils from funkwhale_api.music.management.commands import check_inplace_files from funkwhale_api.music.management.commands import fix_uploads from funkwhale_api.music.management.commands import prune_library @@ -18,7 +19,7 @@ def test_fix_uploads_bitrate_length(factories, mocker): return_value={"bitrate": 42, "length": 43}, ) - c.fix_file_data(dry_run=False) + c.fix_file_data(dry_run=False, batch_size=100) upload1.refresh_from_db() upload2.refresh_from_db() @@ -41,7 +42,7 @@ def test_fix_uploads_size(factories, mocker): mocker.patch("funkwhale_api.music.models.Upload.get_file_size", return_value=2) - c.fix_file_size(dry_run=False) + c.fix_file_size(dry_run=False, batch_size=100) upload1.refresh_from_db() upload2.refresh_from_db() @@ -69,7 +70,7 @@ def test_fix_uploads_mimetype(factories, mocker): mimetype="audio/something", ) c = fix_uploads.Command() - c.fix_mimetypes(dry_run=False) + c.fix_mimetypes(dry_run=False, batch_size=100) upload1.refresh_from_db() upload2.refresh_from_db() @@ -78,6 +79,25 @@ def test_fix_uploads_mimetype(factories, mocker): assert upload2.mimetype == "audio/something" 
+def test_fix_uploads_checksum(factories, mocker): + upload1 = factories["music.Upload"]() + upload2 = factories["music.Upload"]() + upload1.__class__.objects.filter(pk=upload1.pk).update(checksum="test") + upload2.__class__.objects.filter(pk=upload2.pk).update(checksum=None) + c = fix_uploads.Command() + + c.fix_file_checksum(dry_run=False, batch_size=100) + + upload1.refresh_from_db() + upload2.refresh_from_db() + + # not updated + assert upload1.checksum == "test" + + # updated + assert upload2.checksum == common_utils.get_file_hash(upload2.audio_file) + + def test_prune_library_dry_run(factories): prunable = factories["music.Track"]() not_prunable = factories["music.Track"]() diff --git a/api/tests/music/test_models.py b/api/tests/music/test_models.py index 74d5e9604..d8e3bb444 100644 --- a/api/tests/music/test_models.py +++ b/api/tests/music/test_models.py @@ -5,6 +5,7 @@ import pytest from django.utils import timezone from django.urls import reverse +from funkwhale_api.common import utils as common_utils from funkwhale_api.music import importers, models, tasks from funkwhale_api.federation import utils as federation_utils @@ -164,6 +165,17 @@ def test_audio_track_mime_type(extention, mimetype, factories): assert upload.mimetype == mimetype +@pytest.mark.parametrize("name", ["test.ogg", "test.mp3"]) +def test_audio_track_checksum(name, factories): + + path = os.path.join(DATA_DIR, name) + upload = factories["music.Upload"](audio_file__from_path=path, mimetype=None) + + with open(path, "rb") as f: + expected = common_utils.get_file_hash(f) + assert upload.checksum == expected + + def test_upload_file_name(factories): name = "test.mp3" path = os.path.join(DATA_DIR, name) diff --git a/api/tests/music/test_tasks.py b/api/tests/music/test_tasks.py index 4fbe6b513..96ba451aa 100644 --- a/api/tests/music/test_tasks.py +++ b/api/tests/music/test_tasks.py @@ -1329,3 +1329,40 @@ def test_can_import_track_with_same_position_in_same_discs_skipped(factories, mo new_upload.refresh_from_db() assert new_upload.import_status == "skipped" + + +def test_update_track_metadata(factories): + track = factories["music.Track"]() + data = { + "title": "Peer Gynt Suite no. 1, op. 46: I. Morning", + "artist": "Edvard Grieg", + "album_artist": "Edvard Grieg; Musopen Symphony Orchestra", + "album": "Peer Gynt Suite no. 1, op. 
46", + "date": "2012-08-15", + "position": "4", + "disc_number": "2", + "musicbrainz_albumid": "a766da8b-8336-47aa-a3ee-371cc41ccc75", + "mbid": "bd21ac48-46d8-4e78-925f-d9cc2a294656", + "musicbrainz_artistid": "013c8e5b-d72a-4cd3-8dee-6c64d6125823", + "musicbrainz_albumartistid": "013c8e5b-d72a-4cd3-8dee-6c64d6125823;5b4d7d2d-36df-4b38-95e3-a964234f520f", + "license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/", + "copyright": "Someone", + "comment": "hello there", + } + tasks.update_track_metadata(metadata.FakeMetadata(data), track) + + track.refresh_from_db() + + assert track.title == data["title"] + assert track.position == int(data["position"]) + assert track.disc_number == int(data["disc_number"]) + assert track.license.code == "cc-by-sa-4.0" + assert track.copyright == data["copyright"] + assert str(track.mbid) == data["mbid"] + assert track.album.title == data["album"] + assert track.album.release_date == datetime.date(2012, 8, 15) + assert str(track.album.mbid) == data["musicbrainz_albumid"] + assert track.artist.name == data["artist"] + assert str(track.artist.mbid) == data["musicbrainz_artistid"] + assert track.album.artist.name == "Edvard Grieg" + assert str(track.album.artist.mbid) == "013c8e5b-d72a-4cd3-8dee-6c64d6125823" diff --git a/api/tests/test_import_audio_file.py b/api/tests/test_import_audio_file.py index c7d5a976c..04c06a8f4 100644 --- a/api/tests/test_import_audio_file.py +++ b/api/tests/test_import_audio_file.py @@ -4,6 +4,8 @@ import pytest from django.core.management import call_command from django.core.management.base import CommandError +from funkwhale_api.common import utils as common_utils +from funkwhale_api.music.management.commands import import_files DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files") @@ -159,3 +161,194 @@ def test_import_files_in_place(factories, mocker, settings): def test_storage_rename_utf_8_files(factories): upload = factories["music.Upload"](audio_file__filename="été.ogg") assert upload.audio_file.name.endswith("ete.ogg") + + +@pytest.mark.parametrize("name", ["modified", "moved", "created", "deleted"]) +def test_handle_event(name, mocker): + handler = mocker.patch.object(import_files, "handle_{}".format(name)) + + event = {"type": name} + stdout = mocker.Mock() + kwargs = {"hello": "world"} + import_files.handle_event(event, stdout, **kwargs) + + handler.assert_called_once_with(event=event, stdout=stdout, **kwargs) + + +def test_handle_created(mocker): + handle_modified = mocker.patch.object(import_files, "handle_modified") + + event = mocker.Mock() + stdout = mocker.Mock() + kwargs = {"hello": "world"} + import_files.handle_created(event, stdout, **kwargs) + + handle_modified.assert_called_once_with(event, stdout, **kwargs) + + +def test_handle_deleted(factories, mocker): + stdout = mocker.Mock() + event = { + "path": "/path.mp3", + } + library = factories["music.Library"]() + deleted = factories["music.Upload"]( + library=library, + source="file://{}".format(event["path"]), + import_status="finished", + audio_file=None, + ) + kept = [ + factories["music.Upload"]( + library=library, + source="file://{}".format(event["path"]), + import_status="finished", + ), + factories["music.Upload"]( + source="file://{}".format(event["path"]), + import_status="finished", + audio_file=None, + ), + ] + + import_files.handle_deleted( + event=event, stdout=stdout, library=library, in_place=True + ) + + with pytest.raises(deleted.DoesNotExist): + deleted.refresh_from_db() + + for upload in kept: + 
upload.refresh_from_db() + + +def test_handle_moved(factories, mocker): + stdout = mocker.Mock() + event = { + "src_path": "/path.mp3", + "dest_path": "/new_path.mp3", + } + library = factories["music.Library"]() + updated = factories["music.Upload"]( + library=library, + source="file://{}".format(event["src_path"]), + import_status="finished", + audio_file=None, + ) + untouched = [ + factories["music.Upload"]( + library=library, + source="file://{}".format(event["src_path"]), + import_status="finished", + ), + factories["music.Upload"]( + source="file://{}".format(event["src_path"]), + import_status="finished", + audio_file=None, + ), + ] + + import_files.handle_moved( + event=event, stdout=stdout, library=library, in_place=True + ) + + updated.refresh_from_db() + assert updated.source == "file://{}".format(event["dest_path"]) + for upload in untouched: + source = upload.source + upload.refresh_from_db() + assert source == upload.source + + +def test_handle_modified_creates_upload(tmpfile, factories, mocker): + stdout = mocker.Mock() + event = { + "path": tmpfile.name, + } + process_upload = mocker.patch("funkwhale_api.music.tasks.process_upload") + library = factories["music.Library"]() + import_files.handle_modified( + event=event, + stdout=stdout, + library=library, + in_place=True, + reference="hello", + replace=False, + dispatch_outbox=False, + broadcast=False, + ) + upload = library.uploads.latest("id") + assert upload.source == "file://{}".format(event["path"]) + + process_upload.assert_called_once_with(upload_id=upload.pk) + + +def test_handle_modified_skips_existing_checksum(tmpfile, factories, mocker): + stdout = mocker.Mock() + event = { + "path": tmpfile.name, + } + tmpfile.write(b"hello") + + library = factories["music.Library"]() + factories["music.Upload"]( + checksum=common_utils.get_file_hash(tmpfile), + library=library, + import_status="finished", + ) + import_files.handle_modified( + event=event, stdout=stdout, library=library, in_place=True, + ) + assert library.uploads.count() == 1 + + +def test_handle_modified_update_existing_path_if_found(tmpfile, factories, mocker): + stdout = mocker.Mock() + event = { + "path": tmpfile.name, + } + update_track_metadata = mocker.patch( + "funkwhale_api.music.tasks.update_track_metadata" + ) + get_metadata = mocker.patch("funkwhale_api.music.models.Upload.get_metadata") + library = factories["music.Library"]() + track = factories["music.Track"](attributed_to=library.actor) + upload = factories["music.Upload"]( + source="file://{}".format(event["path"]), + track=track, + checksum="old", + library=library, + import_status="finished", + audio_file=None, + ) + import_files.handle_modified( + event=event, stdout=stdout, library=library, in_place=True, + ) + update_track_metadata.assert_called_once_with( + get_metadata.return_value, upload.track, + ) + + +def test_handle_modified_update_existing_path_if_found_and_attributed_to( + tmpfile, factories, mocker +): + stdout = mocker.Mock() + event = { + "path": tmpfile.name, + } + update_track_metadata = mocker.patch( + "funkwhale_api.music.tasks.update_track_metadata" + ) + library = factories["music.Library"]() + factories["music.Upload"]( + source="file://{}".format(event["path"]), + checksum="old", + library=library, + track__attributed_to=factories["federation.Actor"](), + import_status="finished", + audio_file=None, + ) + import_files.handle_modified( + event=event, stdout=stdout, library=library, in_place=True, + ) + update_track_metadata.assert_not_called() diff --git 
a/changes/changelog.d/1093.bugfix b/changes/changelog.d/1093.bugfix index 1d9a15e59..0295841af 100644 --- a/changes/changelog.d/1093.bugfix +++ b/changes/changelog.d/1093.bugfix @@ -1 +1 @@ -Fixed mimetype detection issue that broke transcoding on some tracks (#1093). Run ``python manage.py fix_uploads --mimetypes`` to set proper mimetypes on existing uploads. +Fixed mimetype detection issue that broke transcoding on some tracks (#1093). Run ``python manage.py fix_uploads --mimetype`` to set proper mimetypes on existing uploads. diff --git a/changes/changelog.d/721.feature b/changes/changelog.d/721.feature new file mode 100644 index 000000000..332f992eb --- /dev/null +++ b/changes/changelog.d/721.feature @@ -0,0 +1 @@ +Support a --watch mode with ``import_files`` to automatically add, update and remove files when the filesystem is updated (#721) diff --git a/docs/admin/importing-music.rst b/docs/admin/importing-music.rst index bc0a642cb..bff23845b 100644 --- a/docs/admin/importing-music.rst +++ b/docs/admin/importing-music.rst @@ -1,15 +1,21 @@ -Importing music -================ +Importing music from the server +=============================== -From music directory on the server ----------------------------------- - -You can import music files in Funkwhale assuming they are located on the server -and readable by the Funkwhale application. Your music files should contain at +Funkwhale can import music files that are located on the server assuming +they are readable by the Funkwhale application. Your music files should contain at least an ``artist``, ``album`` and ``title`` tags, but we recommend you tag it extensively using a proper tool, such as Beets or Musicbrainz Picard. -You can import those tracks as follows, assuming they are located in +Funkwhale supports two different import modes: + +- copy (the default): files are copied into Funkwhale's internal storage. This means importing a 1GB library will result in the same amount of space being used by Funkwhale. +- :ref:`in-place <in-place-import>` (when the ``--in-place`` flag is provided): files are referenced in Funkwhale's DB but not copied or touched in any way. This is useful if you have a huge library, or one that is updated by an external tool such as Beets. + +.. note:: + + In Funkwhale 1.0, **the default behaviour will change to in-place import**. + +Regardless of the mode you choose, import works as described below, assuming your files are located in ``/srv/funkwhale/data/music``: .. code-block:: bash export LIBRARY_ID="" python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput +.. note:: + You'll have to create a library in the Web UI first to get your library ID. Simply visit + https://yourdomain/content/libraries/ to create one. + + Library IDs are available in library URLs or sharing links. In this example: + https://funkwhale.instance/content/libraries/769a2ae3-eb3d-4aff-9f94-2c4d80d5c2d1, + the library ID is 769a2ae3-eb3d-4aff-9f94-2c4d80d5c2d1 + + You can use only the first characters of the ID when calling the command, like this: + ``export LIBRARY_ID="769a2ae3"`` + When you use docker, the ``/srv/funkwhale/data/music`` is mounted from the host to the ``/music`` directory on the container: @@ -32,16 +49,6 @@ When you installed Funkwhale via ansible, you need to call a script instead of P export LIBRARY_ID="" /srv/funkwhale/manage import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput -.. 
note:: - You'll have to create a library in the Web UI before to get your library ID. Simply visit - https://yourdomain/content/libraries/ to create one. - - Library IDs are available in library urls or sharing link. In this example: - https://funkwhale.instance/content/libraries/769a2ae3-eb3d-4aff-9f94-2c4d80d5c2d1, - the library ID is 769a2bc3-eb1d-4aff-9f84-2c4d80d5c2d1 - - You can use only the first characters of the ID when calling the command, like that: - ``export LIBRARY_ID="769a2bc3"`` The import command supports several options, and you can check the help to get details:: @@ -63,6 +70,7 @@ get details:: At the moment, only Flac, OGG/Vorbis and MP3 files with ID3 tags are supported + .. _in-place-import: In-place import @@ -88,14 +96,6 @@ configuration options to ensure the webserver can serve them properly: - :ref:`setting-MUSIC_DIRECTORY_PATH` - :ref:`setting-MUSIC_DIRECTORY_SERVE_PATH` -.. warning:: - - While in-place import is faster and less disk-space-hungry, it's also - more fragile: if, for some reason, you move or rename the source files, - Funkwhale will not be able to serve those files anymore. - - Thus, be especially careful when you manipulate the source files. - We recommend you symlink all your music directories into ``/srv/funkwhale/data/music`` and run the `import_files` command from that directory. This will make it possible to use multiple music directories, without any additional configuration @@ -134,6 +134,49 @@ If you want to go with symlinks, ensure each symlinked directory is mounted as a # add your symlinked dirs here - /media/nfsshare:/media/nfsshare:ro +Metadata updates +^^^^^^^^^^^^^^^^ + +When doing an import in ``in-place`` mode, the importer will also check and update existing entries +found in the database. For instance, if a file was imported, then its ID3 Title tag was updated, and you rerun a scan, +Funkwhale will pick up the new title. The following fields can be updated this way: + +- Track mbid +- Track title +- Track position and disc number +- Track license and copyright +- Album cover +- Album title +- Album mbid +- Album release date +- Artist name +- Artist mbid +- Album artist name +- Album artist mbid + + +React to filesystem events with ``--watch`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you have a really big library or one that is updated quite often, running the ``import_files`` command by hand +may not be practical. To help with this use case, the ``import_files`` command supports a ``--watch`` flag that observes filesystem events +instead of performing a full import. + +File creation, move, update and removal are handled when ``--watch`` is provided: + +- Files created in the watched directory are imported immediately +- If using ``in-place`` mode, file updates trigger a metadata update on the corresponding entries +- If using ``in-place`` mode, files that are moved and known by Funkwhale will see their path updated in Funkwhale's DB +- If using ``in-place`` mode, files that are removed and known by Funkwhale will be removed from Funkwhale's DB + +Pruning dangling metadata with ``--prune`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Funkwhale is, by design, conservative with music metadata in its database. If you remove a file from Funkwhale's DB, +the corresponding artist, album and track objects won't be deleted by default. + +If you want to prune dangling metadata from the database once the ``import_files`` command is over, simply add the ``--prune`` flag. +This also works with ``--watch``. 
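+
+For example, assuming your library ID is stored in ``LIBRARY_ID`` and your music lives in
+``/srv/funkwhale/data/music`` (adjust both to your setup), a combined watch-and-prune
+invocation could look like this:
+
+.. code-block:: bash
+
+    export LIBRARY_ID="769a2ae3"
+    python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --in-place --watch --prune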
Album covers ^^^^^^^^^^^^ @@ -159,9 +202,3 @@ under creative commons (courtesy of Jamendo): ./download-tracks.sh music.txt This will download a bunch of zip archives (one per album) under the ``data/music`` directory and unzip their content. - -From other instances --------------------- - -Funkwhale also supports importing music from other instances. Please refer -to :doc:`../federation/index` for more details.
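A usage sketch of the new ``fix_uploads`` flags added in this patch: ``--checksum`` and ``--batch-size`` are defined above, the 500 value is just an illustrative override of the default of 1000, and the ``--dry-run`` spelling is an assumption based on the ``dry_run`` option checked in ``handle()``:

.. code-block:: bash

    # backfill the new Upload.checksum column, updating 500 rows per bulk update
    python api/manage.py fix_uploads --checksum --batch-size 500

    # report what would be fixed without committing anything (flag spelling assumed)
    python api/manage.py fix_uploads --checksum --dry-run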