kopia lustrzana https://dev.funkwhale.audio/funkwhale/funkwhale
Improved CLI importer reliability and UX
rodzic
6678c46d88
commit
ecb7c464a1
|
@ -47,7 +47,7 @@ class Command(BaseCommand):
|
|||
self.stdout.write("Checking {} in-place imported files…".format(total))
|
||||
|
||||
missing = []
|
||||
for i, row in enumerate(candidates.values("id", "source")):
|
||||
for i, row in enumerate(candidates.values("id", "source").iterator()):
|
||||
path = row["source"].replace("file://", "")
|
||||
progress(self.stdout, i + 1, total)
|
||||
if not os.path.exists(path):
|
||||
|
|
|
@ -1,13 +1,40 @@
|
|||
import glob
|
||||
import itertools
|
||||
import os
|
||||
import urllib.parse
|
||||
import time
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.files import File
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from django.utils import timezone
|
||||
|
||||
from funkwhale_api.music import models, tasks
|
||||
from funkwhale_api.music import models, tasks, utils
|
||||
|
||||
|
||||
def crawl_dir(dir, extensions, recursive=True):
|
||||
if os.path.isfile(dir):
|
||||
yield dir
|
||||
return
|
||||
with os.scandir(dir) as scanner:
|
||||
for entry in scanner:
|
||||
if entry.is_file():
|
||||
for e in extensions:
|
||||
if entry.name.lower().endswith(".{}".format(e.lower())):
|
||||
yield entry.path
|
||||
elif recursive and entry.is_dir():
|
||||
yield from crawl_dir(entry, extensions, recursive=recursive)
|
||||
|
||||
|
||||
def batch(iterable, n=1):
|
||||
has_entries = True
|
||||
while has_entries:
|
||||
current = []
|
||||
for i in range(0, n):
|
||||
try:
|
||||
current.append(next(iterable))
|
||||
except StopIteration:
|
||||
has_entries = False
|
||||
yield current
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
|
@ -89,6 +116,7 @@ class Command(BaseCommand):
|
|||
"of overhead on your server and on servers you are federating with."
|
||||
),
|
||||
)
|
||||
parser.add_argument("-e", "--extension", nargs="+")
|
||||
|
||||
parser.add_argument(
|
||||
"--broadcast",
|
||||
|
@ -119,10 +147,17 @@ class Command(BaseCommand):
|
|||
help="Do NOT prompt the user for input of any kind.",
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
glob_kwargs = {}
|
||||
matching = []
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
"-s",
|
||||
dest="batch_size",
|
||||
default=1000,
|
||||
type=int,
|
||||
help="Size of each batch, only used when crawling large collections",
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
self.is_confirmed = False
|
||||
try:
|
||||
library = models.Library.objects.select_related("actor__user").get(
|
||||
uuid__startswith=options["library_id"]
|
||||
|
@ -133,14 +168,100 @@ class Command(BaseCommand):
|
|||
if not library.actor.get_user():
|
||||
raise CommandError("Library {} is not a local library".format(library.uuid))
|
||||
|
||||
if options["recursive"]:
|
||||
glob_kwargs["recursive"] = True
|
||||
for import_path in options["path"]:
|
||||
matching += glob.glob(import_path, **glob_kwargs)
|
||||
raw_matching = sorted(list(set(matching)))
|
||||
if options["in_place"]:
|
||||
self.stdout.write(
|
||||
"Checking imported paths against settings.MUSIC_DIRECTORY_PATH"
|
||||
)
|
||||
|
||||
for import_path in options["path"]:
|
||||
p = settings.MUSIC_DIRECTORY_PATH
|
||||
if not p:
|
||||
raise CommandError(
|
||||
"Importing in-place requires setting the "
|
||||
"MUSIC_DIRECTORY_PATH variable"
|
||||
)
|
||||
if p and not import_path.startswith(p):
|
||||
raise CommandError(
|
||||
"Importing in-place only works if importing"
|
||||
"from {} (MUSIC_DIRECTORY_PATH), as this directory"
|
||||
"needs to be accessible by the webserver."
|
||||
"Culprit: {}".format(p, import_path)
|
||||
)
|
||||
|
||||
extensions = options.get("extension") or utils.SUPPORTED_EXTENSIONS
|
||||
crawler = itertools.chain(
|
||||
*[
|
||||
crawl_dir(p, extensions=extensions, recursive=options["recursive"])
|
||||
for p in options["path"]
|
||||
]
|
||||
)
|
||||
errors = []
|
||||
total = 0
|
||||
start_time = time.time()
|
||||
reference = options["reference"] or "cli-{}".format(timezone.now().isoformat())
|
||||
|
||||
import_url = "{}://{}/content/libraries/{}/upload?{}"
|
||||
import_url = import_url.format(
|
||||
settings.FUNKWHALE_PROTOCOL,
|
||||
settings.FUNKWHALE_HOSTNAME,
|
||||
str(library.uuid),
|
||||
urllib.parse.urlencode([("import", reference)]),
|
||||
)
|
||||
self.stdout.write(
|
||||
"For details, please refer to import reference '{}' or URL {}".format(
|
||||
reference, import_url
|
||||
)
|
||||
)
|
||||
batch_start = None
|
||||
batch_duration = None
|
||||
for i, entries in enumerate(batch(crawler, options["batch_size"])):
|
||||
total += len(entries)
|
||||
batch_start = time.time()
|
||||
time_stats = ""
|
||||
if i > 0:
|
||||
time_stats = " - running for {}s, previous batch took {}s".format(
|
||||
int(time.time() - start_time), int(batch_duration),
|
||||
)
|
||||
if entries:
|
||||
self.stdout.write(
|
||||
"Handling batch {} ({} items){}".format(
|
||||
i + 1, options["batch_size"], time_stats,
|
||||
)
|
||||
)
|
||||
batch_errors = self.handle_batch(
|
||||
library=library,
|
||||
paths=entries,
|
||||
batch=i + 1,
|
||||
reference=reference,
|
||||
options=options,
|
||||
)
|
||||
if batch_errors:
|
||||
errors += batch_errors
|
||||
|
||||
batch_duration = time.time() - batch_start
|
||||
|
||||
message = "Successfully imported {} tracks in {}s"
|
||||
if options["async_"]:
|
||||
message = "Successfully launched import for {} tracks in {}s"
|
||||
|
||||
self.stdout.write(
|
||||
message.format(total - len(errors), int(time.time() - start_time))
|
||||
)
|
||||
if len(errors) > 0:
|
||||
self.stderr.write("{} tracks could not be imported:".format(len(errors)))
|
||||
|
||||
for path, error in errors:
|
||||
self.stderr.write("- {}: {}".format(path, error))
|
||||
|
||||
self.stdout.write(
|
||||
"For details, please refer to import reference '{}' or URL {}".format(
|
||||
reference, import_url
|
||||
)
|
||||
)
|
||||
|
||||
def handle_batch(self, library, paths, batch, reference, options):
|
||||
matching = []
|
||||
for m in raw_matching:
|
||||
for m in paths:
|
||||
# In some situations, the path is encoded incorrectly on the filesystem
|
||||
# so we filter out faulty paths and display a warning to the user.
|
||||
# see https://dev.funkwhale.audio/funkwhale/funkwhale/issues/138
|
||||
|
@ -160,96 +281,57 @@ class Command(BaseCommand):
|
|||
)
|
||||
)
|
||||
|
||||
if options["in_place"]:
|
||||
self.stdout.write(
|
||||
"Checking imported paths against settings.MUSIC_DIRECTORY_PATH"
|
||||
)
|
||||
p = settings.MUSIC_DIRECTORY_PATH
|
||||
if not p:
|
||||
raise CommandError(
|
||||
"Importing in-place requires setting the "
|
||||
"MUSIC_DIRECTORY_PATH variable"
|
||||
)
|
||||
for m in matching:
|
||||
if not m.startswith(p):
|
||||
raise CommandError(
|
||||
"Importing in-place only works if importing"
|
||||
"from {} (MUSIC_DIRECTORY_PATH), as this directory"
|
||||
"needs to be accessible by the webserver."
|
||||
"Culprit: {}".format(p, m)
|
||||
)
|
||||
if not matching:
|
||||
raise CommandError("No file matching pattern, aborting")
|
||||
|
||||
if options["replace"]:
|
||||
filtered = {"initial": matching, "skipped": [], "new": matching}
|
||||
message = "- {} files to be replaced"
|
||||
message = " - {} files to be replaced"
|
||||
import_paths = matching
|
||||
else:
|
||||
filtered = self.filter_matching(matching, library)
|
||||
message = "- {} files already found in database"
|
||||
message = " - {} files already found in database"
|
||||
import_paths = filtered["new"]
|
||||
|
||||
self.stdout.write("Import summary:")
|
||||
self.stdout.write(" Import summary:")
|
||||
self.stdout.write(
|
||||
"- {} files found matching this pattern: {}".format(
|
||||
" - {} files found matching this pattern: {}".format(
|
||||
len(matching), options["path"]
|
||||
)
|
||||
)
|
||||
self.stdout.write(message.format(len(filtered["skipped"])))
|
||||
|
||||
self.stdout.write("- {} new files".format(len(filtered["new"])))
|
||||
self.stdout.write(" - {} new files".format(len(filtered["new"])))
|
||||
|
||||
self.stdout.write(
|
||||
"Selected options: {}".format(
|
||||
", ".join(["in place" if options["in_place"] else "copy music files"])
|
||||
if batch == 1:
|
||||
self.stdout.write(
|
||||
" Selected options: {}".format(
|
||||
", ".join(
|
||||
["in place" if options["in_place"] else "copy music files"]
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
if len(filtered["new"]) == 0:
|
||||
self.stdout.write("Nothing new to import, exiting")
|
||||
self.stdout.write(" Nothing new to import, exiting")
|
||||
return
|
||||
|
||||
if options["interactive"]:
|
||||
if options["interactive"] and not self.is_confirmed:
|
||||
message = (
|
||||
"Are you sure you want to do this?\n\n"
|
||||
"Type 'yes' to continue, or 'no' to cancel: "
|
||||
)
|
||||
if input("".join(message)) != "yes":
|
||||
raise CommandError("Import cancelled.")
|
||||
reference = options["reference"] or "cli-{}".format(timezone.now().isoformat())
|
||||
|
||||
import_url = "{}://{}/content/libraries/{}/upload?{}"
|
||||
import_url = import_url.format(
|
||||
settings.FUNKWHALE_PROTOCOL,
|
||||
settings.FUNKWHALE_HOSTNAME,
|
||||
str(library.uuid),
|
||||
urllib.parse.urlencode([("import", reference)]),
|
||||
)
|
||||
self.stdout.write(
|
||||
"For details, please refer to import reference '{}' or URL {}".format(
|
||||
reference, import_url
|
||||
)
|
||||
)
|
||||
self.is_confirmed = True
|
||||
|
||||
errors = self.do_import(
|
||||
import_paths, library=library, reference=reference, options=options
|
||||
)
|
||||
message = "Successfully imported {} tracks"
|
||||
if options["async_"]:
|
||||
message = "Successfully launched import for {} tracks"
|
||||
|
||||
self.stdout.write(message.format(len(import_paths)))
|
||||
if len(errors) > 0:
|
||||
self.stderr.write("{} tracks could not be imported:".format(len(errors)))
|
||||
|
||||
for path, error in errors:
|
||||
self.stderr.write("- {}: {}".format(path, error))
|
||||
|
||||
self.stdout.write(
|
||||
"For details, please refer to import reference '{}' or URL {}".format(
|
||||
reference, import_url
|
||||
)
|
||||
import_paths,
|
||||
library=library,
|
||||
reference=reference,
|
||||
batch=batch,
|
||||
options=options,
|
||||
)
|
||||
return errors
|
||||
|
||||
def filter_matching(self, matching, library):
|
||||
sources = ["file://{}".format(p) for p in matching]
|
||||
|
@ -266,17 +348,20 @@ class Command(BaseCommand):
|
|||
}
|
||||
return result
|
||||
|
||||
def do_import(self, paths, library, reference, options):
|
||||
message = "{i}/{total} Importing {path}..."
|
||||
def do_import(self, paths, library, reference, batch, options):
|
||||
message = "[batch {batch}] {i}/{total} Importing {path}..."
|
||||
if options["async_"]:
|
||||
message = "{i}/{total} Launching import for {path}..."
|
||||
message = "[batch {batch}] {i}/{total} Launching import for {path}..."
|
||||
|
||||
# we create an upload binded to the library
|
||||
async_ = options["async_"]
|
||||
errors = []
|
||||
for i, path in list(enumerate(paths)):
|
||||
if options["verbosity"] > 1:
|
||||
self.stdout.write(
|
||||
message.format(batch=batch, path=path, i=i + 1, total=len(paths))
|
||||
)
|
||||
try:
|
||||
self.stdout.write(message.format(path=path, i=i + 1, total=len(paths)))
|
||||
self.create_upload(
|
||||
path,
|
||||
reference,
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
CLI Importer is now more reliable and less resource-hungry on large libraries
|
|
@ -5,3 +5,20 @@ Next release notes
|
|||
|
||||
Those release notes refer to the current development branch and are reset
|
||||
after each release.
|
||||
|
||||
More reliable CLI importer
|
||||
--------------------------
|
||||
|
||||
Our CLI importer is now more reliable and less prone to Out-of-Memory issues, especially when scanning large libraries. (hundreds of GB or bigger)
|
||||
|
||||
We've also improved the directory crawling logic, so that you don't have to use glob patterns or specify extensions when importing.
|
||||
|
||||
This means you can replace scripts that look like this::
|
||||
|
||||
python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/**/*.ogg" "/srv/funkwhale/data/music/**/*.mp3" --recursive --noinput
|
||||
|
||||
By this:
|
||||
|
||||
python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput
|
||||
|
||||
And Funkwhale will happily import any supported audio file from the specified directory.
|
||||
|
|
|
@ -15,7 +15,7 @@ You can import those tracks as follows, assuming they are located in
|
|||
.. code-block:: bash
|
||||
|
||||
export LIBRARY_ID="<your_libary_id>"
|
||||
python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/**/*.ogg" --recursive --noinput
|
||||
python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput
|
||||
|
||||
When you use docker, the ``/srv/funkwhale/data/music`` is mounted from the host
|
||||
to the ``/music`` directory on the container:
|
||||
|
@ -23,14 +23,14 @@ to the ``/music`` directory on the container:
|
|||
.. code-block:: bash
|
||||
|
||||
export LIBRARY_ID="<your_libary_id>"
|
||||
docker-compose run --rm api python manage.py import_files $LIBRARY_ID "/music/**/*.ogg" --recursive --noinput
|
||||
docker-compose run --rm api python manage.py import_files $LIBRARY_ID "/music/" --recursive --noinput
|
||||
|
||||
When you installed Funkwhale via ansible, you need to call a script instead of Python, and the folder path must be adapted accordingly:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
export LIBRARY_ID="<your_libary_id>"
|
||||
/srv/funkwhale/manage import_files $LIBRARY_ID "/srv/funkwhale/data/music/**/**/*.ogg" --recursive --noinput
|
||||
/srv/funkwhale/manage import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput
|
||||
|
||||
.. note::
|
||||
You'll have to create a library in the Web UI before to get your library ID. Simply visit
|
||||
|
@ -107,9 +107,9 @@ you can create a symlink like this::
|
|||
ln -s /media/mynfsshare /srv/funkwhale/data/music/nfsshare
|
||||
|
||||
And import music from this share with this command::
|
||||
|
||||
|
||||
export LIBRARY_ID="<your_libary_id>"
|
||||
python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/nfsshare/**/*.ogg" --recursive --noinput --in-place
|
||||
python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/nfsshare/" --recursive --noinput --in-place
|
||||
|
||||
On docker setups, it will require a bit more work, because while the ``/srv/funkwhale/data/music`` is mounted
|
||||
in containers, symlinked directories are not.
|
||||
|
|
Ładowanie…
Reference in New Issue