Importer updates: watch directories, handle metadata updates

merge-requests/1094/merge
Agate 2020-05-07 09:55:29 +02:00
rodzic a179229f6d
commit 6eb049b2d9
16 zmienionych plików z 1005 dodań i 120 usunięć

Wyświetl plik

@ -1309,3 +1309,6 @@ IGNORE_FORWARDED_HOST_AND_PROTO = env.bool(
"""
Use :attr:`FUNKWHALE_HOSTNAME` and :attr:`FUNKWHALE_PROTOCOL ` instead of request header.
"""
HASHING_ALGORITHM = "sha256"
HASHING_CHUNK_SIZE = 1024 * 100

Wyświetl plik

@ -1,4 +1,5 @@
import datetime
import hashlib
from django.core.files.base import ContentFile
from django.http import request
@ -458,3 +459,19 @@ def monkey_patch_request_build_absolute_uri():
request.HttpRequest.scheme = property(scheme)
request.HttpRequest.get_host = get_host
def get_file_hash(file, algo=None, chunk_size=None, full_read=False):
    """
    Return a digest of ``file`` formatted as ``"<algo>:<hexdigest>"``.

    ``algo`` is the name of a ``hashlib`` constructor and ``chunk_size`` the
    number of bytes requested per read; when falsy, they fall back to
    ``settings.HASHING_ALGORITHM`` and ``settings.HASHING_CHUNK_SIZE``.

    The file is rewound before reading. With ``full_read`` the whole content
    is hashed chunk by chunk, otherwise only the first ``chunk_size`` bytes.
    """
    if not algo:
        algo = settings.HASHING_ALGORITHM
    if not chunk_size:
        chunk_size = settings.HASHING_CHUNK_SIZE
    digest = getattr(hashlib, algo)()
    file.seek(0)
    if not full_read:
        # hashing only the head of the file avoids a lot of I/O when
        # crawling large libraries
        digest.update(file.read(chunk_size))
    else:
        while True:
            block = file.read(chunk_size)
            if block == b"":
                break
            digest.update(block)
    return "{}:{}".format(algo, digest.hexdigest())

Wyświetl plik

@ -2,6 +2,7 @@ from django.core.management.base import BaseCommand
from django.db import transaction
from django.db.models import Q
from funkwhale_api.common import utils as common_utils
from funkwhale_api.music import models, utils
@ -17,9 +18,9 @@ class Command(BaseCommand):
help="Do not execute anything",
)
parser.add_argument(
"--mimetypes",
"--mimetype",
action="store_true",
dest="mimetypes",
dest="mimetype",
default=True,
help="Check and fix mimetypes",
)
@ -37,16 +38,33 @@ class Command(BaseCommand):
default=False,
help="Check and fix file size, can be really slow because it needs to access files",
)
parser.add_argument(
"--checksum",
action="store_true",
dest="checksum",
default=False,
help="Check and fix file size, can be really slow because it needs to access files",
)
parser.add_argument(
"--batch-size",
"-s",
dest="batch_size",
default=1000,
type=int,
help="Size of each updated batch",
)
def handle(self, *args, **options):
if options["dry_run"]:
self.stdout.write("Dry-run on, will not commit anything")
if options["mimetypes"]:
if options["mimetype"]:
self.fix_mimetypes(**options)
if options["data"]:
self.fix_file_data(**options)
if options["size"]:
self.fix_file_size(**options)
if options["checksum"]:
self.fix_file_checksum(**options)
@transaction.atomic
def fix_mimetypes(self, dry_run, **kwargs):
@ -54,11 +72,12 @@ class Command(BaseCommand):
matching = models.Upload.objects.filter(
Q(source__startswith="file://") | Q(source__startswith="upload://")
).exclude(mimetype__startswith="audio/")
total = matching.count()
self.stdout.write(
"[mimetypes] {} entries found with bad or no mimetype".format(
matching.count()
)
"[mimetypes] {} entries found with bad or no mimetype".format(total)
)
if not total:
return
for extension, mimetype in utils.EXTENSION_TO_MIMETYPE.items():
qs = matching.filter(source__endswith=".{}".format(extension))
self.stdout.write(
@ -81,24 +100,36 @@ class Command(BaseCommand):
)
if dry_run:
return
for i, upload in enumerate(matching.only("audio_file")):
self.stdout.write(
"[bitrate/length] {}/{} fixing file #{}".format(i + 1, total, upload.pk)
)
try:
audio_file = upload.get_audio_file()
if audio_file:
chunks = common_utils.chunk_queryset(
matching.only("id", "audio_file", "source"), kwargs["batch_size"]
)
handled = 0
for chunk in chunks:
updated = []
for upload in chunk:
handled += 1
self.stdout.write(
"[bitrate/length] {}/{} fixing file #{}".format(
handled, total, upload.pk
)
)
try:
audio_file = upload.get_audio_file()
data = utils.get_audio_file_data(audio_file)
upload.bitrate = data["bitrate"]
upload.duration = data["length"]
upload.save(update_fields=["duration", "bitrate"])
except Exception as e:
self.stderr.write(
"[bitrate/length] error with file #{}: {}".format(
upload.pk, str(e)
)
)
else:
self.stderr.write("[bitrate/length] no file found")
except Exception as e:
self.stderr.write(
"[bitrate/length] error with file #{}: {}".format(upload.pk, str(e))
)
updated.append(upload)
models.Upload.objects.bulk_update(updated, ["bitrate", "duration"])
def fix_file_size(self, dry_run, **kwargs):
self.stdout.write("Fixing missing size...")
@ -107,15 +138,64 @@ class Command(BaseCommand):
self.stdout.write("[size] {} entries found with missing values".format(total))
if dry_run:
return
for i, upload in enumerate(matching.only("size")):
self.stdout.write(
"[size] {}/{} fixing file #{}".format(i + 1, total, upload.pk)
)
try:
upload.size = upload.get_file_size()
upload.save(update_fields=["size"])
except Exception as e:
self.stderr.write(
"[size] error with file #{}: {}".format(upload.pk, str(e))
chunks = common_utils.chunk_queryset(
matching.only("id", "audio_file", "source"), kwargs["batch_size"]
)
handled = 0
for chunk in chunks:
updated = []
for upload in chunk:
handled += 1
self.stdout.write(
"[size] {}/{} fixing file #{}".format(handled, total, upload.pk)
)
try:
upload.size = upload.get_file_size()
except Exception as e:
self.stderr.write(
"[size] error with file #{}: {}".format(upload.pk, str(e))
)
else:
updated.append(upload)
models.Upload.objects.bulk_update(updated, ["size"])
def fix_file_checksum(self, dry_run, **kwargs):
    """
    Compute and store the checksum of uploads that have none.

    Only uploads with a stored audio file or a ``file://`` source are
    considered. With ``dry_run`` the number of matching rows is reported and
    nothing is written. Otherwise uploads are processed in chunks of
    ``kwargs["batch_size"]`` and each chunk is persisted with a single
    ``bulk_update``. Uploads whose file cannot be hashed are reported on
    stderr and left untouched.
    """
    self.stdout.write("Fixing missing checksums...")
    matching = models.Upload.objects.filter(
        Q(checksum=None)
        & (Q(audio_file__isnull=False) | Q(source__startswith="file://"))
    )
    total = matching.count()
    self.stdout.write(
        "[checksum] {} entries found with missing values".format(total)
    )
    if dry_run:
        return
    chunks = common_utils.chunk_queryset(
        matching.only("id", "audio_file", "source"), kwargs["batch_size"]
    )
    handled = 0
    for chunk in chunks:
        updated = []
        for upload in chunk:
            handled += 1
            self.stdout.write(
                "[checksum] {}/{} fixing file #{}".format(handled, total, upload.pk)
            )
            audio_file = None
            try:
                audio_file = upload.get_audio_file()
                upload.checksum = common_utils.get_file_hash(audio_file)
            except Exception as e:
                self.stderr.write(
                    "[checksum] error with file #{}: {}".format(upload.pk, str(e))
                )
            else:
                updated.append(upload)
            finally:
                # release the handle right away: this loop may walk a whole
                # library and would otherwise keep one open file per upload
                if audio_file is not None:
                    audio_file.close()
        models.Upload.objects.bulk_update(updated, ["checksum"])

Wyświetl plik

@ -1,17 +1,29 @@
import collections
import datetime
import itertools
import os
import urllib.parse
import queue
import threading
import time
import urllib.parse
import watchdog.events
import watchdog.observers
from django.conf import settings
from django.core.files import File
from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError
from django.db.models import Q
from django.utils import timezone
from rest_framework import serializers
from funkwhale_api.common import utils as common_utils
from funkwhale_api.music import models, tasks, utils
def crawl_dir(dir, extensions, recursive=True):
def crawl_dir(dir, extensions, recursive=True, ignored=[]):
if os.path.isfile(dir):
yield dir
return
@ -20,9 +32,12 @@ def crawl_dir(dir, extensions, recursive=True):
if entry.is_file():
for e in extensions:
if entry.name.lower().endswith(".{}".format(e.lower())):
yield entry.path
if entry.path not in ignored:
yield entry.path
elif recursive and entry.is_dir():
yield from crawl_dir(entry, extensions, recursive=recursive)
yield from crawl_dir(
entry, extensions, recursive=recursive, ignored=ignored
)
def batch(iterable, n=1):
@ -116,6 +131,17 @@ class Command(BaseCommand):
"of overhead on your server and on servers you are federating with."
),
)
parser.add_argument(
"--watch",
action="store_true",
dest="watch",
default=False,
help=(
"Start the command in watch mode. Instead of running a full import, "
"and exit, watch the given path and import new files, remove deleted "
"files, and update metadata corresponding to updated files."
),
)
parser.add_argument("-e", "--extension", nargs="+")
parser.add_argument(
@ -128,6 +154,15 @@ class Command(BaseCommand):
"This causes some overhead, so it's disabled by default."
),
)
parser.add_argument(
"--prune",
action="store_true",
dest="prune",
default=False,
help=(
"Once the import is completed, prune tracks, ablums and artists that aren't linked to any upload."
),
)
parser.add_argument(
"--reference",
@ -157,6 +192,8 @@ class Command(BaseCommand):
)
def handle(self, *args, **options):
# handle relative directories
options["path"] = [os.path.abspath(path) for path in options["path"]]
self.is_confirmed = False
try:
library = models.Library.objects.select_related("actor__user").get(
@ -182,22 +219,12 @@ class Command(BaseCommand):
)
if p and not import_path.startswith(p):
raise CommandError(
"Importing in-place only works if importing"
"Importing in-place only works if importing "
"from {} (MUSIC_DIRECTORY_PATH), as this directory"
"needs to be accessible by the webserver."
"Culprit: {}".format(p, import_path)
)
extensions = options.get("extension") or utils.SUPPORTED_EXTENSIONS
crawler = itertools.chain(
*[
crawl_dir(p, extensions=extensions, recursive=options["recursive"])
for p in options["path"]
]
)
errors = []
total = 0
start_time = time.time()
reference = options["reference"] or "cli-{}".format(timezone.now().isoformat())
import_url = "{}://{}/library/{}/upload?{}"
@ -212,8 +239,62 @@ class Command(BaseCommand):
reference, import_url
)
)
extensions = options.get("extension") or utils.SUPPORTED_EXTENSIONS
if options["watch"]:
if len(options["path"]) > 1:
raise CommandError("Watch only work with a single directory")
return self.setup_watcher(
extensions=extensions,
path=options["path"][0],
reference=reference,
library=library,
in_place=options["in_place"],
prune=options["prune"],
recursive=options["recursive"],
replace=options["replace"],
dispatch_outbox=options["outbox"],
broadcast=options["broadcast"],
)
update = True
checked_paths = set()
if options["in_place"] and update:
self.stdout.write("Checking existing files for updates…")
message = (
"Are you sure you want to do this?\n\n"
"Type 'yes' to continue, or 'no' to skip checking for updates in "
"already imported files: "
)
if options["interactive"] and input("".join(message)) != "yes":
pass
else:
checked_paths = check_updates(
stdout=self.stdout,
paths=options["path"],
extensions=extensions,
library=library,
batch_size=options["batch_size"],
)
self.stdout.write("Existing files checked, moving on to next step!")
crawler = itertools.chain(
*[
crawl_dir(
p,
extensions=extensions,
recursive=options["recursive"],
ignored=checked_paths,
)
for p in options["path"]
]
)
errors = []
total = 0
start_time = time.time()
batch_start = None
batch_duration = None
self.stdout.write("Starting import of new files…")
for i, entries in enumerate(batch(crawler, options["batch_size"])):
total += len(entries)
batch_start = time.time()
@ -225,7 +306,7 @@ class Command(BaseCommand):
if entries:
self.stdout.write(
"Handling batch {} ({} items){}".format(
i + 1, options["batch_size"], time_stats,
i + 1, len(entries), time_stats,
)
)
batch_errors = self.handle_batch(
@ -240,9 +321,9 @@ class Command(BaseCommand):
batch_duration = time.time() - batch_start
message = "Successfully imported {} tracks in {}s"
message = "Successfully imported {} new tracks in {}s"
if options["async_"]:
message = "Successfully launched import for {} tracks in {}s"
message = "Successfully launched import for {} new tracks in {}s"
self.stdout.write(
message.format(total - len(errors), int(time.time() - start_time))
@ -259,6 +340,12 @@ class Command(BaseCommand):
)
)
if options["prune"]:
self.stdout.write(
"Pruning dangling tracks, albums and artists from library…"
)
prune()
def handle_batch(self, library, paths, batch, reference, options):
matching = []
for m in paths:
@ -362,15 +449,15 @@ class Command(BaseCommand):
message.format(batch=batch, path=path, i=i + 1, total=len(paths))
)
try:
self.create_upload(
path,
reference,
library,
async_,
options["replace"],
options["in_place"],
options["outbox"],
options["broadcast"],
create_upload(
path=path,
reference=reference,
library=library,
async_=async_,
replace=options["replace"],
in_place=options["in_place"],
dispatch_outbox=options["outbox"],
broadcast=options["broadcast"],
)
except Exception as e:
if options["exit_on_failure"]:
@ -382,34 +469,311 @@ class Command(BaseCommand):
errors.append((path, "{} {}".format(e.__class__.__name__, e)))
return errors
def create_upload(
self,
path,
reference,
library,
async_,
replace,
in_place,
dispatch_outbox,
broadcast,
):
import_handler = tasks.process_upload.delay if async_ else tasks.process_upload
upload = models.Upload(library=library, import_reference=reference)
upload.source = "file://" + path
upload.import_metadata = {
"funkwhale": {
"config": {
"replace": replace,
"dispatch_outbox": dispatch_outbox,
"broadcast": broadcast,
}
def setup_watcher(self, path, extensions, recursive, **kwargs):
    """
    Watch ``path`` for filesystem changes until interrupted.

    A daemon worker thread consumes events from a queue (see
    ``process_load_queue``; ``kwargs`` are forwarded to it), while a watchdog
    observer feeds that queue with events for files matching ``extensions``.
    The calling thread loops forever, and every 10 seconds prunes the library
    if ``kwargs["prune"]`` is set and a deletion was flagged in ``GLOBAL``.
    """
    watchdog_queue = queue.Queue()
    # Set up a worker thread to process database load. It is a daemon so it
    # never prevents the command from exiting.
    worker = threading.Thread(
        target=process_load_queue(self.stdout, **kwargs),
        args=(watchdog_queue,),
        daemon=True,
    )
    worker.start()

    # setup watchdog to monitor directory for trigger files
    patterns = ["*.{}".format(e) for e in extensions]
    event_handler = Watcher(
        stdout=self.stdout, queue=watchdog_queue, patterns=patterns,
    )
    observer = watchdog.observers.Observer()
    observer.schedule(event_handler, path, recursive=recursive)
    observer.start()

    try:
        while True:
            self.stdout.write(
                "Watching for changes at {}".format(path), ending="\r"
            )
            time.sleep(10)
            if kwargs["prune"] and GLOBAL["need_pruning"]:
                self.stdout.write("Some files were deleted, pruning library…")
                prune()
                GLOBAL["need_pruning"] = False
    except KeyboardInterrupt:
        self.stdout.write("Exiting…")
    finally:
        # always shut the observer down, including when the loop is left
        # because of an unexpected error (e.g. a failing prune)
        observer.stop()
        observer.join()
# Module-level state shared with the watcher: handle_deleted() sets
# "need_pruning" after removing uploads, the watch loop reads and resets it.
GLOBAL = {"need_pruning": False}


def prune():
    """Run the ``prune_library`` command on artists, albums and tracks, dry-run disabled."""
    options = {
        "dry_run": False,
        "prune_artists": True,
        "prune_albums": True,
        "prune_tracks": True,
    }
    call_command("prune_library", **options)
def create_upload(
path, reference, library, async_, replace, in_place, dispatch_outbox, broadcast,
):
import_handler = tasks.process_upload.delay if async_ else tasks.process_upload
upload = models.Upload(library=library, import_reference=reference)
upload.source = "file://" + path
upload.import_metadata = {
"funkwhale": {
"config": {
"replace": replace,
"dispatch_outbox": dispatch_outbox,
"broadcast": broadcast,
}
}
if not in_place:
name = os.path.basename(path)
with open(path, "rb") as f:
upload.audio_file.save(name, File(f), save=False)
}
if not in_place:
name = os.path.basename(path)
with open(path, "rb") as f:
upload.audio_file.save(name, File(f), save=False)
upload.save()
upload.save()
import_handler(upload_id=upload.pk)
import_handler(upload_id=upload.pk)
def process_load_queue(stdout, **kwargs):
    """
    Build the worker function that consumes watcher events from a queue.

    The returned callable takes the queue as its only argument and loops
    forever: it drains the queue, keeps only the latest event per path, and
    once an event is older than the flush delay, logs it to ``stdout`` and
    passes it to ``handle_event`` together with ``kwargs``.
    """

    def inner(q):
        # we batch events, to avoid calling same methods multiple times if a file is modified
        # a lot in a really short time
        flush_delay = 2
        batched_events = collections.OrderedDict()
        while True:
            while not q.empty():
                event = q.get()
                batched_events[event["path"]] = event
            # iterate over a copy, entries are removed as they are processed
            for path, event in batched_events.copy().items():
                if time.time() - event["time"] <= flush_delay:
                    continue
                now = datetime.datetime.utcnow()
                stdout.write(
                    "{} -- Processing {}:{}...\n".format(
                        now.strftime("%Y/%m/%d %H:%M:%S"), event["type"], event["path"]
                    )
                )
                del batched_events[path]
                try:
                    handle_event(event, stdout=stdout, **kwargs)
                except Exception as e:
                    # a failing event (e.g. a file that vanished before being
                    # read) must not kill the worker thread, otherwise the
                    # watcher would silently stop importing anything
                    stdout.write(
                        " Error while processing {}:{}: {} {}\n".format(
                            event["type"], event["path"], e.__class__.__name__, e
                        )
                    )
            time.sleep(1)

    return inner
class Watcher(watchdog.events.PatternMatchingEventHandler):
    """Event handler that forwards every matching filesystem event to a queue."""

    def __init__(self, stdout, queue, patterns):
        self.stdout = stdout
        self.queue = queue
        super().__init__(patterns=patterns)

    def enqueue(self, event):
        """Put a plain-dict snapshot of ``event``, stamped with the current time, on the queue."""
        self.queue.put(
            dict(
                is_directory=event.is_directory,
                type=event.event_type,
                path=event.src_path,
                src_path=event.src_path,
                # only some events carry a destination
                dest_path=getattr(event, "dest_path", None),
                time=time.time(),
            )
        )

    def on_moved(self, event):
        self.enqueue(event)

    def on_created(self, event):
        self.enqueue(event)

    def on_deleted(self, event):
        self.enqueue(event)

    def on_modified(self, event):
        self.enqueue(event)
def handle_event(event, stdout, **kwargs):
    """Dispatch a queued event dict to the handler registered for its ``type``."""
    dispatch = dict(
        modified=handle_modified,
        created=handle_created,
        moved=handle_moved,
        deleted=handle_deleted,
    )
    callback = dispatch[event["type"]]
    callback(event=event, stdout=stdout, **kwargs)
def handle_modified(event, stdout, library, in_place, **kwargs):
    """
    React to a file being written at ``event["path"]``.

    The file checksum is compared with finished uploads of ``library``:

    - if an upload already has this checksum, nothing is done;
    - otherwise, when importing in place and an upload exists for this path,
      its track metadata is refreshed from the file (unless the track is
      attributed to another actor) and the new checksum is stored;
    - otherwise a new upload is created synchronously, using ``reference``,
      ``replace``, ``dispatch_outbox`` and ``broadcast`` from ``kwargs``.
    """
    existing_candidates = library.uploads.filter(import_status="finished")
    with open(event["path"], "rb") as f:
        checksum = common_utils.get_file_hash(f)

    existing = existing_candidates.filter(checksum=checksum).first()
    if existing:
        # found an existing file with same checksum, nothing to do
        stdout.write(" File already imported and metadata is up-to-date")
        return

    to_update = None
    if in_place:
        source = "file://{}".format(event["path"])
        to_update = (
            existing_candidates.in_place()
            .filter(source=source)
            .select_related(
                "track__attributed_to", "track__artist", "track__album__artist",
            )
            .first()
        )

    if to_update:
        if (
            to_update.track.attributed_to
            and to_update.track.attributed_to != library.actor
        ):
            # the message has no placeholder, so no formatting is applied
            stdout.write(
                " Cannot update track metadata, track belongs to someone else"
            )
            return

        stdout.write(
            " Updating existing file #{} with new metadata…".format(to_update.pk)
        )
        audio_metadata = to_update.get_metadata()
        try:
            tasks.update_track_metadata(audio_metadata, to_update.track)
        except serializers.ValidationError as e:
            stdout.write(" Invalid metadata: {}".format(e))
        else:
            # only record the new checksum once the metadata was applied
            to_update.checksum = checksum
            to_update.save(update_fields=["checksum"])
        return

    stdout.write(" Launching import for new file")
    create_upload(
        path=event["path"],
        reference=kwargs["reference"],
        library=library,
        async_=False,
        replace=kwargs["replace"],
        in_place=in_place,
        dispatch_outbox=kwargs["dispatch_outbox"],
        broadcast=kwargs["broadcast"],
    )
def handle_created(event, stdout, **kwargs):
    """
    Handle a "created" event exactly like a "modified" one.

    When a file is copied into the watched directory, a created event is fired
    on the initial touch, followed by many modified events while the file is
    being written, so both are processed the same way.
    """
    outcome = handle_modified(event, stdout, **kwargs)
    return outcome
def handle_moved(event, stdout, library, in_place, **kwargs):
    """
    Point the in-place upload stored for ``event["src_path"]`` at ``event["dest_path"]``.

    Does nothing unless ``in_place`` is set and a finished in-place upload of
    ``library`` exists for the old path. Only the first match is updated.
    """
    if not in_place:
        return

    old_source = "file://{}".format(event["src_path"])
    new_source = "file://{}".format(event["dest_path"])
    existing = (
        library.uploads.filter(import_status="finished")
        .in_place()
        .filter(source=old_source)
        .first()
    )
    if not existing:
        return

    stdout.write(" Updating path of existing file #{}".format(existing.pk))
    existing.source = new_source
    existing.save(update_fields=["source"])
def handle_deleted(event, stdout, library, in_place, **kwargs):
    """
    Remove the in-place uploads stored for a file deleted from disk.

    Does nothing unless ``in_place`` is set. When finished in-place uploads of
    ``library`` exist for ``event["path"]``, they are deleted and
    ``GLOBAL["need_pruning"]`` is raised so the watch loop can prune the
    library afterwards.
    """
    if not in_place:
        return

    source = "file://{}".format(event["path"])
    existing_candidates = library.uploads.filter(import_status="finished")
    existing_candidates = existing_candidates.in_place().filter(source=source)
    # exists() only checks for a matching row instead of counting all of them
    if existing_candidates.exists():
        stdout.write(" Removing file from DB")
        existing_candidates.delete()
        GLOBAL["need_pruning"] = True
def check_updates(stdout, library, extensions, paths, batch_size):
    """
    Re-check every already imported in-place upload found under ``paths``.

    Finished in-place uploads of ``library`` that have a checksum, whose source
    starts with one of ``paths`` and ends with one of ``extensions``, are
    passed to ``check_upload`` in batches of ``batch_size``.

    Returns the set of filesystem paths (source without the ``file://``
    prefix) that were checked.
    """
    candidates = (
        library.uploads.in_place()
        .filter(import_status="finished")
        .exclude(checksum=None)
        .select_related("library", "track")
    )
    # one condition per (path, extension) pair, OR-ed together below
    conditions = [
        Q(source__startswith="file://{}".format(path))
        & Q(source__endswith=".{}".format(ext))
        for path in paths
        for ext in extensions
    ]
    combined = conditions[0]
    for condition in conditions[1:]:
        combined = condition | combined
    candidates = candidates.filter(combined)

    total = candidates.count()
    stdout.write("Found {} files to check in database!".format(total))

    checked_paths = set()
    ordered = candidates.order_by("source")
    for index, rows in enumerate(batch(ordered.iterator(), batch_size), start=1):
        stdout.write("Handling batch {} ({} items)".format(index, len(rows)))
        for upload in rows:
            check_upload(stdout, upload)
            checked_paths.add(upload.source.replace("file://", "", 1))
    return checked_paths
def check_upload(stdout, upload):
    """
    Bring a single imported upload back in sync with its file on disk.

    - If the file cannot be found, the upload is deleted and the result of
      ``upload.delete()`` is returned.
    - If the file checksum differs from the stored one, the track metadata is
      refreshed from the file and the new checksum saved, unless the track is
      not attributed to the library's actor or the metadata is invalid.
    - Otherwise nothing happens.
    """
    try:
        audio_file = upload.get_audio_file()
    except FileNotFoundError:
        stdout.write(
            " Removing file #{} missing from disk at {}".format(
                upload.pk, upload.source
            )
        )
        return upload.delete()

    try:
        checksum = common_utils.get_file_hash(audio_file)
    finally:
        # this runs for every upload of a library: close the handle right
        # away instead of leaving it to the garbage collector
        if audio_file is not None:
            audio_file.close()

    if upload.checksum != checksum:
        stdout.write(
            " File #{} at {} was modified, updating metadata…".format(
                upload.pk, upload.source
            )
        )
        if upload.library.actor_id != upload.track.attributed_to_id:
            # the message has no placeholder, so no formatting is applied
            stdout.write(
                " Cannot update track metadata, track belongs to someone else"
            )
        else:
            track = models.Track.objects.select_related("artist", "album__artist").get(
                pk=upload.track_id
            )
            try:
                tasks.update_track_metadata(upload.get_metadata(), track)
            except serializers.ValidationError as e:
                stdout.write(" Invalid metadata: {}".format(e))
                return
            else:
                # only record the new checksum once the metadata was applied
                upload.checksum = checksum
                upload.save(update_fields=["checksum"])

Wyświetl plik

@ -0,0 +1,23 @@
# Generated by Django 3.0.4 on 2020-05-05 08:10
from django.db import migrations, models
# Schema migration for the music app: adds the `checksum` column on uploads
# and updates the mimetype choices of upload versions.
class Migration(migrations.Migration):
# must be applied after music migration 0051
dependencies = [
('music', '0051_auto_20200319_1249'),
]
operations = [
# new optional, indexed text column (up to 100 characters) on `upload`
migrations.AddField(
model_name='upload',
name='checksum',
field=models.CharField(blank=True, db_index=True, max_length=100, null=True),
),
# redefines `uploadversion.mimetype` with the list of choices below
migrations.AlterField(
model_name='uploadversion',
name='mimetype',
field=models.CharField(choices=[('audio/mp3', 'mp3'), ('audio/mpeg3', 'mp3'), ('audio/x-mp3', 'mp3'), ('audio/mpeg', 'mp3'), ('video/ogg', 'ogg'), ('audio/ogg', 'ogg'), ('audio/opus', 'opus'), ('audio/x-m4a', 'aac'), ('audio/x-m4a', 'm4a'), ('audio/x-flac', 'flac'), ('audio/flac', 'flac')], max_length=50),
),
]

Wyświetl plik

@ -655,6 +655,14 @@ class Track(APIModelMixin):
class UploadQuerySet(common_models.NullsLastQuerySet):
def in_place(self, include=True):
    """
    Filter on uploads imported in place.

    An in-place upload has a ``file://`` source and an empty or null
    ``audio_file``. With ``include=False`` the condition is negated and every
    other upload is returned instead.
    """
    no_stored_file = models.Q(audio_file="") | models.Q(audio_file=None)
    query = models.Q(source__startswith="file://") & no_stored_file
    return self.filter(query if include else ~query)
def playable_by(self, actor, include=True):
libraries = Library.objects.viewable_by(actor)
@ -754,6 +762,9 @@ class Upload(models.Model):
)
downloads_count = models.PositiveIntegerField(default=0)
# stores checksums such as `sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855`
checksum = models.CharField(max_length=100, db_index=True, null=True, blank=True)
objects = UploadQuerySet.as_manager()
@property
@ -833,7 +844,7 @@ class Upload(models.Model):
def get_audio_file(self):
if self.audio_file:
return self.audio_file.open()
if self.source.startswith("file://"):
if self.source and self.source.startswith("file://"):
return open(self.source.replace("file://", "", 1), "rb")
def get_audio_data(self):
@ -866,6 +877,15 @@ class Upload(models.Model):
self.mimetype = mimetypes.guess_type(self.source)[0]
if not self.size and self.audio_file:
self.size = self.audio_file.size
if not self.checksum:
try:
audio_file = self.get_audio_file()
except FileNotFoundError:
pass
else:
if audio_file:
self.checksum = common_utils.get_file_hash(audio_file)
if not self.pk and not self.fid and self.library.actor.get_user():
self.fid = self.get_federation_id()
return super().save(**kwargs)

Wyświetl plik

@ -851,3 +851,71 @@ def update_library_entity(obj, data):
obj.save(update_fields=list(data.keys()))
return obj
# Fields that update_track_metadata() may overwrite, grouped by the kind of
# object they belong to. Each field maps to an options dict: an empty dict
# means the value is read from the metadata under the field's own name, while
# a "getter" callable (called with the metadata section and the field name)
# computes the value instead.
UPDATE_CONFIG = {
"track": {
"position": {},
"title": {},
"mbid": {},
"disc_number": {},
"copyright": {},
"license": {
# the license is matched from both the license and copyright values
"getter": lambda data, field: licenses.match(
data.get("license"), data.get("copyright")
)
},
},
"album": {"title": {}, "mbid": {}, "release_date": {}},
"artist": {"name": {}, "mbid": {}},
"album_artist": {"name": {}, "mbid": {}},
}
@transaction.atomic
def update_track_metadata(audio_metadata, track):
    """
    Update ``track`` and its album, artist and album artist from file metadata.

    This supports refreshing the database when an imported file is modified by
    an outside tool (e.g beets). ``audio_metadata`` is validated with
    ``metadata.TrackMetadataSerializer``, and a validation error is raised if
    it is invalid. Only the fields listed in ``UPDATE_CONFIG`` are considered,
    and only those whose value actually changed are saved. If the metadata
    carries cover data and the track has an album, the cover is attached to
    that album.
    """
    serializer = metadata.TrackMetadataSerializer(data=audio_metadata)
    serializer.is_valid(raise_exception=True)
    new_data = serializer.validated_data

    # (UPDATE_CONFIG key, object to update, function extracting the matching
    # section from the validated metadata)
    to_update = [
        ("track", track, lambda data: data),
        ("album", track.album, lambda data: data["album"]),
        ("artist", track.artist, lambda data: data["artists"][0]),
        (
            "album_artist",
            track.album.artist if track.album else None,
            lambda data: data["album"]["artists"][0],
        ),
    ]
    for entity, obj, data_getter in to_update:
        if not obj:
            continue
        try:
            obj_data = data_getter(new_data)
        except (IndexError, KeyError):
            # the metadata has no section for this object: the cover handling
            # below already treats "album" as optional, so skip instead of
            # aborting the whole update
            continue

        obj_updated_fields = []
        for field, config in UPDATE_CONFIG[entity].items():
            getter = config.get(
                "getter", lambda data, field: data[config.get("field", field)]
            )
            try:
                new_value = getter(obj_data, field)
            except KeyError:
                continue
            old_value = getattr(obj, field)
            if new_value == old_value:
                continue
            obj_updated_fields.append(field)
            setattr(obj, field, new_value)

        if obj_updated_fields:
            obj.save(update_fields=obj_updated_fields)

    if track.album and "album" in new_data and new_data["album"].get("cover_data"):
        common_utils.attach_file(
            track.album, "attachment_cover", new_data["album"].get("cover_data")
        )

Wyświetl plik

@ -83,3 +83,4 @@ service_identity==18.1.0
markdown>=3.2,<4
bleach>=3,<4
feedparser==6.0.0b3
watchdog==0.10.2

Wyświetl plik

@ -258,3 +258,12 @@ def test_monkey_patch_request_build_absolute_uri(
request = fake_request.get("/", **meta)
assert request.build_absolute_uri(path) == expected
def test_get_file_hash(tmpfile, settings):
    """The hash of a small file is the configured algorithm name plus the hex digest."""
    settings.HASHING_ALGORITHM = "sha256"
    tmpfile.write(b"hello")

    # echo -n "hello" | sha256sum
    digest = "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"
    assert utils.get_file_hash(tmpfile) == "sha256:" + digest

Wyświetl plik

@ -1,6 +1,7 @@
import os
import pytest
from funkwhale_api.common import utils as common_utils
from funkwhale_api.music.management.commands import check_inplace_files
from funkwhale_api.music.management.commands import fix_uploads
from funkwhale_api.music.management.commands import prune_library
@ -18,7 +19,7 @@ def test_fix_uploads_bitrate_length(factories, mocker):
return_value={"bitrate": 42, "length": 43},
)
c.fix_file_data(dry_run=False)
c.fix_file_data(dry_run=False, batch_size=100)
upload1.refresh_from_db()
upload2.refresh_from_db()
@ -41,7 +42,7 @@ def test_fix_uploads_size(factories, mocker):
mocker.patch("funkwhale_api.music.models.Upload.get_file_size", return_value=2)
c.fix_file_size(dry_run=False)
c.fix_file_size(dry_run=False, batch_size=100)
upload1.refresh_from_db()
upload2.refresh_from_db()
@ -69,7 +70,7 @@ def test_fix_uploads_mimetype(factories, mocker):
mimetype="audio/something",
)
c = fix_uploads.Command()
c.fix_mimetypes(dry_run=False)
c.fix_mimetypes(dry_run=False, batch_size=100)
upload1.refresh_from_db()
upload2.refresh_from_db()
@ -78,6 +79,25 @@ def test_fix_uploads_mimetype(factories, mocker):
assert upload2.mimetype == "audio/something"
def test_fix_uploads_checksum(factories, mocker):
    """``fix_file_checksum`` only fills in checksums that are missing."""
    with_checksum = factories["music.Upload"]()
    without_checksum = factories["music.Upload"]()
    manager = with_checksum.__class__.objects
    manager.filter(pk=with_checksum.pk).update(checksum="test")
    manager.filter(pk=without_checksum.pk).update(checksum=None)

    command = fix_uploads.Command()
    command.fix_file_checksum(dry_run=False, batch_size=100)

    with_checksum.refresh_from_db()
    without_checksum.refresh_from_db()

    # not updated
    assert with_checksum.checksum == "test"
    # updated
    expected = common_utils.get_file_hash(without_checksum.audio_file)
    assert without_checksum.checksum == expected
def test_prune_library_dry_run(factories):
prunable = factories["music.Track"]()
not_prunable = factories["music.Track"]()

Wyświetl plik

@ -5,6 +5,7 @@ import pytest
from django.utils import timezone
from django.urls import reverse
from funkwhale_api.common import utils as common_utils
from funkwhale_api.music import importers, models, tasks
from funkwhale_api.federation import utils as federation_utils
@ -164,6 +165,17 @@ def test_audio_track_mime_type(extention, mimetype, factories):
assert upload.mimetype == mimetype
@pytest.mark.parametrize("name", ["test.ogg", "test.mp3"])
def test_audio_track_checksum(name, factories):
    """Saving an upload stores the checksum of its audio file."""
    path = os.path.join(DATA_DIR, name)
    with open(path, "rb") as source_file:
        expected = common_utils.get_file_hash(source_file)

    upload = factories["music.Upload"](audio_file__from_path=path, mimetype=None)

    assert upload.checksum == expected
def test_upload_file_name(factories):
name = "test.mp3"
path = os.path.join(DATA_DIR, name)

Wyświetl plik

@ -1329,3 +1329,40 @@ def test_can_import_track_with_same_position_in_same_discs_skipped(factories, mo
new_upload.refresh_from_db()
assert new_upload.import_status == "skipped"
def test_update_track_metadata(factories):
    """Track, album, artist and album artist are all refreshed from new metadata."""
    track = factories["music.Track"]()
    tags = {
        "title": "Peer Gynt Suite no. 1, op. 46: I. Morning",
        "artist": "Edvard Grieg",
        "album_artist": "Edvard Grieg; Musopen Symphony Orchestra",
        "album": "Peer Gynt Suite no. 1, op. 46",
        "date": "2012-08-15",
        "position": "4",
        "disc_number": "2",
        "musicbrainz_albumid": "a766da8b-8336-47aa-a3ee-371cc41ccc75",
        "mbid": "bd21ac48-46d8-4e78-925f-d9cc2a294656",
        "musicbrainz_artistid": "013c8e5b-d72a-4cd3-8dee-6c64d6125823",
        "musicbrainz_albumartistid": "013c8e5b-d72a-4cd3-8dee-6c64d6125823;5b4d7d2d-36df-4b38-95e3-a964234f520f",
        "license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/",
        "copyright": "Someone",
        "comment": "hello there",
    }

    tasks.update_track_metadata(metadata.FakeMetadata(tags), track)
    track.refresh_from_db()

    # track fields
    assert track.title == tags["title"]
    assert track.position == int(tags["position"])
    assert track.disc_number == int(tags["disc_number"])
    assert track.license.code == "cc-by-sa-4.0"
    assert track.copyright == tags["copyright"]
    assert str(track.mbid) == tags["mbid"]

    # album fields
    album = track.album
    assert album.title == tags["album"]
    assert album.release_date == datetime.date(2012, 8, 15)
    assert str(album.mbid) == tags["musicbrainz_albumid"]

    # track artist
    assert track.artist.name == tags["artist"]
    assert str(track.artist.mbid) == tags["musicbrainz_artistid"]

    # album artist: only the first listed artist is used
    assert track.album.artist.name == "Edvard Grieg"
    assert str(track.album.artist.mbid) == "013c8e5b-d72a-4cd3-8dee-6c64d6125823"

Wyświetl plik

@ -4,6 +4,8 @@ import pytest
from django.core.management import call_command
from django.core.management.base import CommandError
from funkwhale_api.common import utils as common_utils
from funkwhale_api.music.management.commands import import_files
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")
@ -159,3 +161,194 @@ def test_import_files_in_place(factories, mocker, settings):
def test_storage_rename_utf_8_files(factories):
upload = factories["music.Upload"](audio_file__filename="été.ogg")
assert upload.audio_file.name.endswith("ete.ogg")
@pytest.mark.parametrize("name", ["modified", "moved", "created", "deleted"])
def test_handle_event(name, mocker):
    """Each event type is routed to the ``handle_<type>`` function."""
    stdout = mocker.Mock()
    extra = {"hello": "world"}
    event = {"type": name}
    handler = mocker.patch.object(import_files, "handle_{}".format(name))

    import_files.handle_event(event, stdout, **extra)

    handler.assert_called_once_with(event=event, stdout=stdout, **extra)
def test_handle_created(mocker):
    """``handle_created`` simply delegates to ``handle_modified``."""
    event, stdout = mocker.Mock(), mocker.Mock()
    extra = {"hello": "world"}
    handle_modified = mocker.patch.object(import_files, "handle_modified")

    import_files.handle_created(event, stdout, **extra)

    handle_modified.assert_called_once_with(event, stdout, **extra)
def test_handle_deleted(factories, mocker):
    """Only the in-place upload of the given library is removed on deletion."""
    stdout = mocker.Mock()
    event = {
        "path": "/path.mp3",
    }
    source = "file://{}".format(event["path"])
    library = factories["music.Library"]()
    deleted = factories["music.Upload"](
        library=library, source=source, import_status="finished", audio_file=None,
    )
    # same library but with a stored audio file (not in place)
    same_library = factories["music.Upload"](
        library=library, source=source, import_status="finished",
    )
    # in place, but in another library
    other_library = factories["music.Upload"](
        source=source, import_status="finished", audio_file=None,
    )
    kept = [same_library, other_library]

    import_files.handle_deleted(
        event=event, stdout=stdout, library=library, in_place=True
    )

    with pytest.raises(deleted.DoesNotExist):
        deleted.refresh_from_db()

    # these still exist, refreshing them must not raise
    for upload in kept:
        upload.refresh_from_db()
def test_handle_moved(factories, mocker):
    """``handle_moved`` rewrites the source of the matching upload only.

    The upload created with ``library=library`` and ``audio_file=None`` must
    point at ``dest_path`` afterwards; the two other uploads sharing the old
    source must keep their original ``source`` value.
    """
    event = {"src_path": "/path.mp3", "dest_path": "/new_path.mp3"}
    old_source = "file://{}".format(event["src_path"])
    new_upload = factories["music.Upload"]
    library = factories["music.Library"]()

    updated = new_upload(
        library=library, source=old_source, import_status="finished", audio_file=None,
    )
    untouched = [
        # Same library, but ``audio_file`` is left to the factory.
        new_upload(library=library, source=old_source, import_status="finished"),
        # ``library`` is left to the factory instead of being the target one.
        new_upload(source=old_source, import_status="finished", audio_file=None),
    ]
    sources_before = [upload.source for upload in untouched]

    import_files.handle_moved(
        event=event, stdout=mocker.Mock(), library=library, in_place=True
    )

    updated.refresh_from_db()
    assert updated.source == "file://{}".format(event["dest_path"])
    for expected_source, upload in zip(sources_before, untouched):
        upload.refresh_from_db()
        assert expected_source == upload.source
def test_handle_modified_creates_upload(tmpfile, factories, mocker):
    """``handle_modified`` creates an upload for the file and schedules its processing.

    The newest upload of the library must reference the event path, and the
    patched ``process_upload`` task must be called once with that upload's id.
    """
    process_upload = mocker.patch("funkwhale_api.music.tasks.process_upload")
    event = {"path": tmpfile.name}
    library = factories["music.Library"]()
    options = {
        "in_place": True,
        "reference": "hello",
        "replace": False,
        "dispatch_outbox": False,
        "broadcast": False,
    }

    import_files.handle_modified(
        event=event, stdout=mocker.Mock(), library=library, **options
    )

    created = library.uploads.latest("id")
    assert created.source == "file://{}".format(event["path"])
    process_upload.assert_called_once_with(upload_id=created.pk)
def test_handle_modified_skips_existing_checksum(tmpfile, factories, mocker):
    """No new upload is created when the library already holds the file's checksum."""
    tmpfile.write(b"hello")
    library = factories["music.Library"]()
    known_checksum = common_utils.get_file_hash(tmpfile)
    factories["music.Upload"](
        checksum=known_checksum, library=library, import_status="finished",
    )

    import_files.handle_modified(
        event={"path": tmpfile.name},
        stdout=mocker.Mock(),
        library=library,
        in_place=True,
    )

    # Still only the pre-existing upload.
    assert library.uploads.count() == 1
def test_handle_modified_update_existing_path_if_found(tmpfile, factories, mocker):
    """An existing upload at the event path gets its track metadata updated.

    The track is attributed to the library's actor and the stored checksum
    (``"old"``) differs from the file; ``update_track_metadata`` must then be
    called once with the metadata returned by ``Upload.get_metadata``.
    """
    event = {"path": tmpfile.name}
    update_track_metadata = mocker.patch(
        "funkwhale_api.music.tasks.update_track_metadata"
    )
    get_metadata = mocker.patch("funkwhale_api.music.models.Upload.get_metadata")
    library = factories["music.Library"]()
    own_track = factories["music.Track"](attributed_to=library.actor)
    existing = factories["music.Upload"](
        source="file://{}".format(event["path"]),
        track=own_track,
        checksum="old",
        library=library,
        import_status="finished",
        audio_file=None,
    )

    import_files.handle_modified(
        event=event, stdout=mocker.Mock(), library=library, in_place=True,
    )

    update_track_metadata.assert_called_once_with(
        get_metadata.return_value, existing.track,
    )
def test_handle_modified_update_existing_path_if_found_and_attributed_to(
    tmpfile, factories, mocker
):
    """Track metadata is left alone when the track is attributed to another actor.

    Same setup as the "existing path" case, except the upload's track is
    attributed to a freshly created actor rather than ``library.actor``.
    """
    event = {"path": tmpfile.name}
    update_track_metadata = mocker.patch(
        "funkwhale_api.music.tasks.update_track_metadata"
    )
    library = factories["music.Library"]()
    other_actor = factories["federation.Actor"]()
    factories["music.Upload"](
        source="file://{}".format(event["path"]),
        checksum="old",
        library=library,
        track__attributed_to=other_actor,
        import_status="finished",
        audio_file=None,
    )

    import_files.handle_modified(
        event=event, stdout=mocker.Mock(), library=library, in_place=True,
    )

    update_track_metadata.assert_not_called()

Wyświetl plik

@ -1 +1 @@
Fixed mimetype detection issue that broke transcoding on some tracks (#1093). Run ``python manage.py fix_uploads --mimetypes`` to set proper mimetypes on existing uploads.
Fixed mimetype detection issue that broke transcoding on some tracks (#1093). Run ``python manage.py fix_uploads --mimetype`` to set proper mimetypes on existing uploads.

Wyświetl plik

@ -0,0 +1 @@
Support a --watch mode with ``import_files`` to automatically add, update and remove files when filesystem is updated (#721)

Wyświetl plik

@ -1,15 +1,21 @@
Importing music
================
Importing music from the server
===============================
From music directory on the server
----------------------------------
You can import music files in Funkwhale assuming they are located on the server
and readable by the Funkwhale application. Your music files should contain at
Funkwhale can import music files that are located on the server assuming
they are readable by the Funkwhale application. Your music files should contain at
least an ``artist``, ``album`` and ``title`` tags, but we recommend you tag
it extensively using a proper tool, such as Beets or Musicbrainz Picard.
You can import those tracks as follows, assuming they are located in
Funkwhale supports two different import modes:
- copy (the default): files are copied into Funkwhale's internal storage. This means importing a 1GB library will result in the same amount of space being used by Funkwhale.
- :ref:`in-place <in-place-import>` (when the ``--in-place`` flag is provided): files are referenced in Funkwhale's DB but not copied or touched in any way. This is useful if you have a huge library, or one that is updated by an external tool such as Beets.
.. note::
In Funkwhale 1.0, **the default behaviour will change to in-place import**
Regardless of the mode you're choosing, import works as described below, assuming your files are located in
``/srv/funkwhale/data/music``:
.. code-block:: bash
@ -17,6 +23,17 @@ You can import those tracks as follows, assuming they are located in
export LIBRARY_ID="<your_library_id>"
python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput
.. note::
You'll have to create a library in the Web UI first to get your library ID. Simply visit
https://yourdomain/content/libraries/ to create one.
Library IDs are available in library URLs or sharing links. In this example:
https://funkwhale.instance/content/libraries/769a2ae3-eb3d-4aff-9f94-2c4d80d5c2d1,
the library ID is 769a2ae3-eb3d-4aff-9f94-2c4d80d5c2d1
You can use only the first characters of the ID when calling the command, like this:
``export LIBRARY_ID="769a2ae3"``
When you use docker, the ``/srv/funkwhale/data/music`` is mounted from the host
to the ``/music`` directory on the container:
@ -32,16 +49,6 @@ When you installed Funkwhale via ansible, you need to call a script instead of P
export LIBRARY_ID="<your_library_id>"
/srv/funkwhale/manage import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput
.. note::
You'll have to create a library in the Web UI first to get your library ID. Simply visit
https://yourdomain/content/libraries/ to create one.
Library IDs are available in library URLs or sharing links. In this example:
https://funkwhale.instance/content/libraries/769a2ae3-eb3d-4aff-9f94-2c4d80d5c2d1,
the library ID is 769a2ae3-eb3d-4aff-9f94-2c4d80d5c2d1
You can use only the first characters of the ID when calling the command, like this:
``export LIBRARY_ID="769a2ae3"``
The import command supports several options, and you can check the help to
get details::
@ -63,6 +70,7 @@ get details::
At the moment, only Flac, OGG/Vorbis and MP3 files with ID3 tags are supported
.. _in-place-import:
In-place import
@ -88,14 +96,6 @@ configuration options to ensure the webserver can serve them properly:
- :ref:`setting-MUSIC_DIRECTORY_PATH`
- :ref:`setting-MUSIC_DIRECTORY_SERVE_PATH`
.. warning::
While in-place import is faster and less disk-space-hungry, it's also
more fragile: if, for some reason, you move or rename the source files,
Funkwhale will not be able to serve those files anymore.
Thus, be especially careful when you manipulate the source files.
We recommend you symlink all your music directories into ``/srv/funkwhale/data/music``
and run the ``import_files`` command from that directory. This will make it possible
to use multiple music directories, without any additional configuration
@ -134,6 +134,49 @@ If you want to go with symlinks, ensure each symlinked directory is mounted as a
# add your symlinked dirs here
- /media/nfsshare:/media/nfsshare:ro
Metadata updates
^^^^^^^^^^^^^^^^
When doing an import in ``in-place`` mode, the importer will also check and update existing entries
found in the database. For instance, if a file was imported, the ID3 Title tag was updated, and you rerun a scan,
Funkwhale will pick up the new title. The following fields can be updated this way:
- Track mbid
- Track title
- Track position and disc number
- Track license and copyright
- Album cover
- Album title
- Album mbid
- Album release date
- Artist name
- Artist mbid
- Album artist name
- Album artist mbid
React to filesystem events with ``--watch``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If you have a really big library or one that is updated quite often, running the ``import_files`` command by hand
may not be practical. To help with this use case, the ``import_files`` command supports a ``--watch`` flag that will observe filesystem events
instead of performing a full import.
File creation, move, update and removal are handled when ``--watch`` is provided:
- Files created in the watched directory are imported immediately
- If using ``in-place`` mode, file updates trigger a metadata update on the corresponding entries
- If using ``in-place`` mode, files that are moved and known by Funkwhale will see their path updated in Funkwhale's DB
- If using ``in-place`` mode, files that are removed and known by Funkwhale will be removed from Funkwhale's DB
Pruning dangling metadata with ``--prune``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Funkwhale is, by design, conservative with music metadata in its database. If you remove a file from Funkwhale's DB,
the corresponding artist, album and track objects won't be deleted by default.
If you want to prune dangling metadata from the database once the ``import_files`` command is over, simply add the ``--prune`` flag.
This also works with ``--watch``.
Album covers
^^^^^^^^^^^^
@ -159,9 +202,3 @@ under creative commons (courtesy of Jamendo):
./download-tracks.sh music.txt
This will download a bunch of zip archives (one per album) under the ``data/music`` directory and unzip their content.
From other instances
--------------------
Funkwhale also supports importing music from other instances. Please refer
to :doc:`../federation/index` for more details.