Improved CLI importer reliability and UX

2020-04-15 11:23:50 +02:00 · 2020-04-15 11:23:50 +02:00 · ecb7c464a1
commit ecb7c464a1
--- a/api/funkwhale_api/music/management/commands/check_inplace_files.py
+++ b/api/funkwhale_api/music/management/commands/check_inplace_files.py
@ -47,7 +47,7 @@ class Command(BaseCommand):
        self.stdout.write("Checking {} in-place imported files…".format(total))

        missing = []
-        for i, row in enumerate(candidates.values("id", "source")):
+        for i, row in enumerate(candidates.values("id", "source").iterator()):
            path = row["source"].replace("file://", "")
            progress(self.stdout, i + 1, total)
            if not os.path.exists(path):
--- a/api/funkwhale_api/music/management/commands/import_files.py
+++ b/api/funkwhale_api/music/management/commands/import_files.py
@ -1,13 +1,40 @@
-import glob
+import itertools
 import os
 import urllib.parse
+import time

 from django.conf import settings
 from django.core.files import File
 from django.core.management.base import BaseCommand, CommandError
 from django.utils import timezone

-from funkwhale_api.music import models, tasks
+from funkwhale_api.music import models, tasks, utils
+
+
+def crawl_dir(dir, extensions, recursive=True):
+    if os.path.isfile(dir):
+        yield dir
+        return
+    with os.scandir(dir) as scanner:
+        for entry in scanner:
+            if entry.is_file():
+                for e in extensions:
+                    if entry.name.lower().endswith(".{}".format(e.lower())):
+                        yield entry.path
+            elif recursive and entry.is_dir():
+                yield from crawl_dir(entry, extensions, recursive=recursive)
+
+
+def batch(iterable, n=1):
+    has_entries = True
+    while has_entries:
+        current = []
+        for i in range(0, n):
+            try:
+                current.append(next(iterable))
+            except StopIteration:
+                has_entries = False
+        yield current


 class Command(BaseCommand):
@ -89,6 +116,7 @@ class Command(BaseCommand):
                "of overhead on your server and on servers you are federating with."
            ),
        )
+        parser.add_argument("-e", "--extension", nargs="+")

        parser.add_argument(
            "--broadcast",
@ -119,10 +147,17 @@ class Command(BaseCommand):
            help="Do NOT prompt the user for input of any kind.",
        )

-    def handle(self, *args, **options):
-        glob_kwargs = {}
-        matching = []
+        parser.add_argument(
+            "--batch-size",
+            "-s",
+            dest="batch_size",
+            default=1000,
+            type=int,
+            help="Size of each batch, only used when crawling large collections",
+        )

+    def handle(self, *args, **options):
+        self.is_confirmed = False
        try:
            library = models.Library.objects.select_related("actor__user").get(
                uuid__startswith=options["library_id"]
@ -133,14 +168,100 @@ class Command(BaseCommand):
        if not library.actor.get_user():
            raise CommandError("Library {} is not a local library".format(library.uuid))

-        if options["recursive"]:
-            glob_kwargs["recursive"] = True
-        for import_path in options["path"]:
-            matching += glob.glob(import_path, **glob_kwargs)
-        raw_matching = sorted(list(set(matching)))
+        if options["in_place"]:
+            self.stdout.write(
+                "Checking imported paths against settings.MUSIC_DIRECTORY_PATH"
+            )

+            for import_path in options["path"]:
+                p = settings.MUSIC_DIRECTORY_PATH
+                if not p:
+                    raise CommandError(
+                        "Importing in-place requires setting the "
+                        "MUSIC_DIRECTORY_PATH variable"
+                    )
+                if p and not import_path.startswith(p):
+                    raise CommandError(
+                        "Importing in-place only works if importing"
+                        "from {} (MUSIC_DIRECTORY_PATH), as this directory"
+                        "needs to be accessible by the webserver."
+                        "Culprit: {}".format(p, import_path)
+                    )
+
+        extensions = options.get("extension") or utils.SUPPORTED_EXTENSIONS
+        crawler = itertools.chain(
+            *[
+                crawl_dir(p, extensions=extensions, recursive=options["recursive"])
+                for p in options["path"]
+            ]
+        )
+        errors = []
+        total = 0
+        start_time = time.time()
+        reference = options["reference"] or "cli-{}".format(timezone.now().isoformat())
+
+        import_url = "{}://{}/content/libraries/{}/upload?{}"
+        import_url = import_url.format(
+            settings.FUNKWHALE_PROTOCOL,
+            settings.FUNKWHALE_HOSTNAME,
+            str(library.uuid),
+            urllib.parse.urlencode([("import", reference)]),
+        )
+        self.stdout.write(
+            "For details, please refer to import reference '{}' or URL {}".format(
+                reference, import_url
+            )
+        )
+        batch_start = None
+        batch_duration = None
+        for i, entries in enumerate(batch(crawler, options["batch_size"])):
+            total += len(entries)
+            batch_start = time.time()
+            time_stats = ""
+            if i > 0:
+                time_stats = " - running for {}s, previous batch took {}s".format(
+                    int(time.time() - start_time), int(batch_duration),
+                )
+            if entries:
+                self.stdout.write(
+                    "Handling batch {} ({} items){}".format(
+                        i + 1, options["batch_size"], time_stats,
+                    )
+                )
+                batch_errors = self.handle_batch(
+                    library=library,
+                    paths=entries,
+                    batch=i + 1,
+                    reference=reference,
+                    options=options,
+                )
+                if batch_errors:
+                    errors += batch_errors
+
+            batch_duration = time.time() - batch_start
+
+        message = "Successfully imported {} tracks in {}s"
+        if options["async_"]:
+            message = "Successfully launched import for {} tracks in {}s"
+
+        self.stdout.write(
+            message.format(total - len(errors), int(time.time() - start_time))
+        )
+        if len(errors) > 0:
+            self.stderr.write("{} tracks could not be imported:".format(len(errors)))
+
+            for path, error in errors:
+                self.stderr.write("- {}: {}".format(path, error))
+
+        self.stdout.write(
+            "For details, please refer to import reference '{}' or URL {}".format(
+                reference, import_url
+            )
+        )
+
+    def handle_batch(self, library, paths, batch, reference, options):
        matching = []
-        for m in raw_matching:
+        for m in paths:
            # In some situations, the path is encoded incorrectly on the filesystem
            # so we filter out faulty paths and display a warning to the user.
            # see https://dev.funkwhale.audio/funkwhale/funkwhale/issues/138
@ -160,96 +281,57 @@ class Command(BaseCommand):
                    )
                )

-        if options["in_place"]:
-            self.stdout.write(
-                "Checking imported paths against settings.MUSIC_DIRECTORY_PATH"
-            )
-            p = settings.MUSIC_DIRECTORY_PATH
-            if not p:
-                raise CommandError(
-                    "Importing in-place requires setting the "
-                    "MUSIC_DIRECTORY_PATH variable"
-                )
-            for m in matching:
-                if not m.startswith(p):
-                    raise CommandError(
-                        "Importing in-place only works if importing"
-                        "from {} (MUSIC_DIRECTORY_PATH), as this directory"
-                        "needs to be accessible by the webserver."
-                        "Culprit: {}".format(p, m)
-                    )
        if not matching:
            raise CommandError("No file matching pattern, aborting")

        if options["replace"]:
            filtered = {"initial": matching, "skipped": [], "new": matching}
-            message = "- {} files to be replaced"
+            message = "  - {} files to be replaced"
            import_paths = matching
        else:
            filtered = self.filter_matching(matching, library)
-            message = "- {} files already found in database"
+            message = "  - {} files already found in database"
            import_paths = filtered["new"]

-        self.stdout.write("Import summary:")
+        self.stdout.write("  Import summary:")
        self.stdout.write(
-            "- {} files found matching this pattern: {}".format(
+            "  - {} files found matching this pattern: {}".format(
                len(matching), options["path"]
            )
        )
        self.stdout.write(message.format(len(filtered["skipped"])))

-        self.stdout.write("- {} new files".format(len(filtered["new"])))
+        self.stdout.write("  - {} new files".format(len(filtered["new"])))

-        self.stdout.write(
-            "Selected options: {}".format(
-                ", ".join(["in place" if options["in_place"] else "copy music files"])
+        if batch == 1:
+            self.stdout.write(
+                "  Selected options: {}".format(
+                    ", ".join(
+                        ["in place" if options["in_place"] else "copy music files"]
+                    )
+                )
            )
-        )
        if len(filtered["new"]) == 0:
-            self.stdout.write("Nothing new to import, exiting")
+            self.stdout.write("  Nothing new to import, exiting")
            return

-        if options["interactive"]:
+        if options["interactive"] and not self.is_confirmed:
            message = (
                "Are you sure you want to do this?\n\n"
                "Type 'yes' to continue, or 'no' to cancel: "
            )
            if input("".join(message)) != "yes":
                raise CommandError("Import cancelled.")
-        reference = options["reference"] or "cli-{}".format(timezone.now().isoformat())
-
-        import_url = "{}://{}/content/libraries/{}/upload?{}"
-        import_url = import_url.format(
-            settings.FUNKWHALE_PROTOCOL,
-            settings.FUNKWHALE_HOSTNAME,
-            str(library.uuid),
-            urllib.parse.urlencode([("import", reference)]),
-        )
-        self.stdout.write(
-            "For details, please refer to import reference '{}' or URL {}".format(
-                reference, import_url
-            )
-        )
+            self.is_confirmed = True

        errors = self.do_import(
-            import_paths, library=library, reference=reference, options=options
-        )
-        message = "Successfully imported {} tracks"
-        if options["async_"]:
-            message = "Successfully launched import for {} tracks"
-
-        self.stdout.write(message.format(len(import_paths)))
-        if len(errors) > 0:
-            self.stderr.write("{} tracks could not be imported:".format(len(errors)))
-
-            for path, error in errors:
-                self.stderr.write("- {}: {}".format(path, error))
-
-        self.stdout.write(
-            "For details, please refer to import reference '{}' or URL {}".format(
-                reference, import_url
-            )
+            import_paths,
+            library=library,
+            reference=reference,
+            batch=batch,
+            options=options,
        )
+        return errors

    def filter_matching(self, matching, library):
        sources = ["file://{}".format(p) for p in matching]
@ -266,17 +348,20 @@ class Command(BaseCommand):
        }
        return result

-    def do_import(self, paths, library, reference, options):
-        message = "{i}/{total} Importing {path}..."
+    def do_import(self, paths, library, reference, batch, options):
+        message = "[batch {batch}] {i}/{total} Importing {path}..."
        if options["async_"]:
-            message = "{i}/{total} Launching import for {path}..."
+            message = "[batch {batch}] {i}/{total} Launching import for {path}..."

        # we create an upload binded to the library
        async_ = options["async_"]
        errors = []
        for i, path in list(enumerate(paths)):
+            if options["verbosity"] > 1:
+                self.stdout.write(
+                    message.format(batch=batch, path=path, i=i + 1, total=len(paths))
+                )
            try:
-                self.stdout.write(message.format(path=path, i=i + 1, total=len(paths)))
                self.create_upload(
                    path,
                    reference,
--- a/changes/changelog.d/importer.enhancement
+++ b/changes/changelog.d/importer.enhancement
@ -0,0 +1 @@
+CLI Importer is now more reliable and less resource-hungry on large libraries
--- a/changes/notes.rst
+++ b/changes/notes.rst
@ -5,3 +5,20 @@ Next release notes

    Those release notes refer to the current development branch and are reset
    after each release.
+
+More reliable CLI importer
+--------------------------
+
+Our CLI importer is now more reliable and less prone to Out-of-Memory issues, especially when scanning large libraries. (hundreds of GB or bigger)
+
+We've also improved the directory crawling logic, so that you don't have to use glob patterns or specify extensions when importing.
+
+This means you can replace scripts that look like this::
+
+    python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/**/*.ogg" "/srv/funkwhale/data/music/**/*.mp3" --recursive --noinput
+
+By this:
+
+    python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput
+
+And Funkwhale will happily import any supported audio file from the specified directory.
--- a/docs/admin/importing-music.rst
+++ b/docs/admin/importing-music.rst
@ -15,7 +15,7 @@ You can import those tracks as follows, assuming they are located in
 .. code-block:: bash

    export LIBRARY_ID="<your_libary_id>"
-    python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/**/*.ogg" --recursive --noinput
+    python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput

 When you use docker, the ``/srv/funkwhale/data/music`` is mounted from the host
 to the ``/music`` directory on the container:
@ -23,14 +23,14 @@ to the ``/music`` directory on the container:
 .. code-block:: bash

    export LIBRARY_ID="<your_libary_id>"
-    docker-compose run --rm api python manage.py import_files $LIBRARY_ID "/music/**/*.ogg" --recursive --noinput
+    docker-compose run --rm api python manage.py import_files $LIBRARY_ID "/music/" --recursive --noinput

 When you installed Funkwhale via ansible, you need to call a script instead of Python, and the folder path must be adapted accordingly:

 .. code-block:: bash

    export LIBRARY_ID="<your_libary_id>"
-    /srv/funkwhale/manage import_files $LIBRARY_ID "/srv/funkwhale/data/music/**/**/*.ogg" --recursive --noinput
+    /srv/funkwhale/manage import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput

 .. note::
    You'll have to create a library in the Web UI before to get your library ID. Simply visit
@ -107,9 +107,9 @@ you can create a symlink like this::
    ln -s /media/mynfsshare /srv/funkwhale/data/music/nfsshare

 And import music from this share with this command::
-    
+
    export LIBRARY_ID="<your_libary_id>"
-    python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/nfsshare/**/*.ogg" --recursive --noinput --in-place
+    python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/nfsshare/" --recursive --noinput --in-place

 On docker setups, it will require a bit more work, because while the ``/srv/funkwhale/data/music`` is mounted
 in containers, symlinked directories are not.
				`@ -0,0 +1 @@`
				`CLI Importer is now more reliable and less resource-hungry on large libraries`