diff --git a/test/test_utils.py b/test/test_utils.py index 816cf03f6..4c8182d82 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -156,6 +156,11 @@ class TestUtil(unittest.TestCase): self.assertEqual('yes no', sanitize_filename('yes? no', is_id=False)) self.assertEqual('this - that', sanitize_filename('this: that', is_id=False)) + self.assertEqual('abc_<>\\*|de', sanitize_filename('abc/<>\\*|de', keep_bad_win_chars=True, is_id=False)) + self.assertEqual('xxx_<>\\*|', sanitize_filename('xxx/<>\\*|', keep_bad_win_chars=True, is_id=False)) + self.assertEqual('yes? no', sanitize_filename('yes? no', keep_bad_win_chars=True, is_id=False)) + self.assertEqual('this: that', sanitize_filename('this: that', keep_bad_win_chars=True, is_id=False)) + self.assertEqual(sanitize_filename('AT&T'), 'AT&T') aumlaut = 'ä' self.assertEqual(sanitize_filename(aumlaut), aumlaut) @@ -166,6 +171,10 @@ class TestUtil(unittest.TestCase): sanitize_filename('New World record at 0:12:34'), 'New World record at 0_12_34') + self.assertEqual( + sanitize_filename('New World record at 0:12:34', keep_bad_win_chars=True), + 'New World record at 0:12:34') + self.assertEqual(sanitize_filename('--gasdgf'), '--gasdgf') self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf') self.assertEqual(sanitize_filename('--gasdgf', is_id=False), '_-gasdgf') @@ -217,6 +226,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') + self.assertEqual(sanitize_filename('_n_cd26wFpw', keep_bad_win_chars=True, is_id=True), '_n_cd26wFpw') + self.assertEqual(sanitize_filename('_BD_eEpuzXw', keep_bad_win_chars=True, is_id=True), '_BD_eEpuzXw') + self.assertEqual(sanitize_filename('N0Y__7-UOdI', keep_bad_win_chars=True, is_id=True), 'N0Y__7-UOdI') + def test_sanitize_path(self): if sys.platform != 'win32': return diff --git a/yt_dlp/YoutubeDL.py 
b/yt_dlp/YoutubeDL.py index e0d58f0f4..b6d25c017 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1293,10 +1293,12 @@ class YoutubeDL: na = self.params.get('outtmpl_na_placeholder', 'NA') def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): - return sanitize_filename(str(value), restricted=restricted, is_id=( - bool(re.search(r'(^|[_.])id(\.|$)', key)) - if 'filename-sanitization' in self.params['compat_opts'] - else NO_DEFAULT)) + return sanitize_filename( + str(value), self.params.get('keep_bad_win_chars', False), restricted=restricted, + is_id=( + bool(re.search(r'(^|[_.])id(\.|$)', key)) + if 'filename-sanitization' in self.params['compat_opts'] + else NO_DEFAULT)) sanitizer = sanitize if callable(sanitize) else filename_sanitizer sanitize = bool(sanitize) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 3d606bcba..f8ee68eae 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -815,6 +815,7 @@ def parse_options(argv=None): 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, 'windowsfilenames': opts.windowsfilenames, + 'keep_bad_win_chars': opts.keep_bad_win_chars, 'ignoreerrors': opts.ignoreerrors, 'force_generic_extractor': opts.force_generic_extractor, 'allowed_extractors': opts.allowed_extractors or ['default'], diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index bebbc6b43..139f587f9 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1011,7 +1011,7 @@ class InfoExtractor: if len(basen) > trim_length: h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() basen = basen[:trim_length - len(h)] + h - filename = sanitize_filename(f'{basen}.dump', restricted=True) + filename = sanitize_filename(f'{basen}.dump', self.get_param('keep_bad_win_chars', False), restricted=True) # Working around MAX_PATH limitation on Windows (see # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) if 
compat_os_name == 'nt': diff --git a/yt_dlp/options.py b/yt_dlp/options.py index faa1ee563..5863085f6 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -267,6 +267,10 @@ def create_parser(): out_dict[key] = out_dict.get(key, []) + [val] if append else val setattr(parser.values, option.dest, out_dict) + def _store_multiple_callback(option, opt_str, value, parser, values): + for key, value in values.items(): + setattr(parser.values, key, value) + def when_prefix(default): return { 'default': {}, @@ -1360,7 +1364,13 @@ def create_parser(): help='Force filenames to be Windows-compatible') filesystem.add_option( '--no-windows-filenames', - action='store_false', dest='windowsfilenames', + action='callback', dest='keep_bad_win_chars', default=False, callback=_store_multiple_callback, + callback_kwargs={ + 'values': { + 'windowsfilenames': False, + 'keep_bad_win_chars': True + } + }, help='Make filenames Windows-compatible only if using Windows (default)') filesystem.add_option( '--trim-filenames', '--trim-file-names', metavar='LENGTH', diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index b63766912..302623b30 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -609,11 +609,12 @@ def timeconvert(timestr): return timestamp -def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT): +def sanitize_filename(s, keep_bad_win_chars=False, restricted=False, is_id=NO_DEFAULT): """Sanitizes a string so it could be used as part of a filename. - @param restricted Use a stricter subset of allowed characters - @param is_id Whether this is an ID that should be kept unchanged if possible. - If unset, yt-dlp's new sanitization rules are in effect + @param keep_bad_win_chars Whether to keep characters invalid on Windows + @param restricted Use a stricter subset of allowed characters + @param is_id Whether this is an ID that should be kept unchanged if possible. 
+ If unset, yt-dlp's new sanitization rules are in effect """ if s == '': return '' @@ -623,16 +624,16 @@ def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT): return ACCENT_CHARS[char] elif not restricted and char == '\n': return '\0 ' - elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\': + elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\' and not keep_bad_win_chars: # Replace with their full-width unicode counterparts return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0)) - elif char == '?' or ord(char) < 32 or ord(char) == 127: + elif (not keep_bad_win_chars and char == '?') or ord(char) < 32 or ord(char) == 127: return '' - elif char == '"': + elif not keep_bad_win_chars and char == '"': return '' if restricted else '\'' - elif char == ':': + elif not keep_bad_win_chars and char == ':': return '\0_\0-' if restricted else '\0 \0-' - elif char in '\\/|*<>': + elif (not keep_bad_win_chars and char in '\\|*<>') or char == '/': return '\0_' if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127): return '' if unicodedata.category(char)[0] in 'CM' else '\0_' @@ -641,7 +642,8 @@ def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT): # Replace look-alike Unicode glyphs if restricted and (is_id is NO_DEFAULT or not is_id): s = unicodedata.normalize('NFKC', s) - s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps + s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0) if keep_bad_win_chars + else m.group(0).replace(':', '_'), s) # Handle timestamps result = ''.join(map(replace_insane, s)) if is_id is NO_DEFAULT: result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars