funkwhale/api/funkwhale_api/typesense/utils.py

92 wiersze
2.9 KiB
Python

import logging
import re
import unidecode
from django.conf import settings
from django.core.cache import cache
from lb_matching_tools.cleaner import MetadataCleaner
from funkwhale_api.music import models as music_models
logger = logging.getLogger(__name__)
api_key = settings.TYPESENSE_API_KEY
host = settings.TYPESENSE_HOST
port = settings.TYPESENSE_PORT
protocol = settings.TYPESENSE_PROTOCOL
TYPESENSE_NUM_TYPO = settings.TYPESENSE_NUM_TYPO
class TypesenseNotActivate(Exception):
pass
if not settings.TYPESENSE_API_KEY:
logger.info(
"Typesense is not activated. You can enable it by setting the TYPESENSE_API_KEY env variable."
)
else:
import typesense
def delete_non_alnum_characters(text):
return unidecode.unidecode(re.sub(r"[^\w]+", "", text).lower())
def resolve_recordings_to_fw_track(recordings):
"""
Tries to match a troi recording entity to a fw track using the typesense index.
For test purposes : if multiple fw tracks are returned, we log the information
but only keep the best result in db to avoid duplicates.
"""
if not settings.TYPESENSE_API_KEY:
raise TypesenseNotActivate(
"Typesense is not activated. You can enable it by setting the TYPESENSE_API_KEY env variable."
)
client = typesense.Client(
{
"api_key": api_key,
"nodes": [{"host": host, "port": port, "protocol": protocol}],
"connection_timeout_seconds": 2,
}
)
mc = MetadataCleaner()
for recording in recordings:
rec = mc.clean_recording(recording.name)
artist = mc.clean_artist(recording.artist.name)
canonical_name_for_track = delete_non_alnum_characters(artist + rec)
logger.debug(f"Trying to resolve : {canonical_name_for_track}")
search_parameters = {
"q": canonical_name_for_track,
"query_by": "combined",
"num_typos": TYPESENSE_NUM_TYPO,
"drop_tokens_threshold": 0,
}
matches = client.collections["canonical_fw_data"].documents.search(
search_parameters
)
if matches["hits"]:
hit = matches["hits"][0]
pk = hit["document"]["pk"]
logger.debug(f"Saving match for track with primary key {pk}")
cache.set(recording.mbid, pk)
if settings.DEBUG and matches["hits"][1]:
for hit in matches["hits"][1:]:
pk = hit["document"]["pk"]
fw_track = music_models.Track.objects.get(pk=pk)
logger.info(
f"Duplicate match found for {fw_track.artist.name} {fw_track.title} \
and primary key {pk}. Skipping because of better match."
)
else:
logger.debug("No match found in fw db")
return cache.get_many([rec.mbid for rec in recordings])