Ensure that tabs in non-Latin languages are given non-blank IDs - fixes #1428

2015-07-14 16:14:45 +01:00 · 2015-07-14 16:14:45 +01:00 · 00f707a438
commit 00f707a438
--- a/wagtail/wagtailadmin/templates/wagtailadmin/edit_handlers/tabbed_interface.html
+++ b/wagtail/wagtailadmin/templates/wagtailadmin/edit_handlers/tabbed_interface.html
@ -1,12 +1,13 @@
+{% load wagtailadmin_tags %}
 <ul class="tab-nav merged">
    {% for child in self.children %}
-        <li class="{{ child.classes|join:" " }} {% if forloop.first %}active{% endif %}"><a href="#{{ child.heading|slugify }}" class="{% if forloop.first %}active{% endif %}">{{ child.heading }}</a></li>
+        <li class="{{ child.classes|join:" " }} {% if forloop.first %}active{% endif %}"><a href="#{{ child.heading|cautious_slugify }}" class="{% if forloop.first %}active{% endif %}">{{ child.heading }}</a></li>
    {% endfor %}
 </ul>

 <div class="tab-content">
    {% for child in self.children %}
-        <section id="{{ child.heading|slugify }}" class="{{ child.classes|join:" " }} {% if forloop.first %}active{% endif %}">
+        <section id="{{ child.heading|cautious_slugify }}" class="{{ child.classes|join:" " }} {% if forloop.first %}active{% endif %}">
            {{ child.render_as_object }}
        </section>
    {% endfor %}
--- a/wagtail/wagtailadmin/templatetags/wagtailadmin_tags.py
+++ b/wagtail/wagtailadmin/templatetags/wagtailadmin_tags.py
@ -3,10 +3,12 @@ from __future__ import unicode_literals
 from django.conf import settings
 from django import template
 from django.contrib.humanize.templatetags.humanize import intcomma
+from django.template.defaultfilters import stringfilter

 from wagtail.wagtailcore import hooks
 from wagtail.wagtailcore.models import get_navigation_menu_items, UserPagePermissionsProxy, PageViewRestriction
 from wagtail.wagtailcore.utils import camelcase_to_underscore, escape_script
+from wagtail.wagtailcore.utils import cautious_slugify as _cautious_slugify
 from wagtail.wagtailadmin.menu import admin_menu


@ -183,3 +185,9 @@ def has_unrendered_errors(bound_field):
    the widget does not support the render_with_errors method
    """
    return bound_field.errors and not hasattr(bound_field.field.widget, 'render_with_errors')
+
+
+@register.filter(is_safe=True)
+@stringfilter
+def cautious_slugify(value):  # template filter; delegates to wagtailcore.utils.cautious_slugify
+    return _cautious_slugify(value)
--- a/wagtail/wagtailcore/tests/test_utils.py
+++ b/wagtail/wagtailcore/tests/test_utils.py
@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.test import TestCase
+
+from django.utils.text import slugify
+from wagtail.wagtailcore.utils import cautious_slugify
+
+
+class TestCautiousSlugify(TestCase):
+
+    def test_behaves_same_as_slugify_for_latin_chars(self):
+        test_cases = [
+            ('', ''),
+            ('???', ''),
+            ('Hello world', 'hello-world'),
+            ('Hello_world', 'hello_world'),
+            ('Hellö wörld', 'hello-world'),
+            ('Hello   world', 'hello-world'),
+            ('   Hello world   ', 'hello-world'),
+            ('Hello, world!', 'hello-world'),
+            ('Hello*world', 'helloworld'),
+            ('Hello☃world', 'helloworld'),
+        ]
+
+        for (original, expected_result) in test_cases:
+            self.assertEqual(slugify(original), expected_result)  # baseline: Django's own slugify
+            self.assertEqual(cautious_slugify(original), expected_result)  # must match the baseline
+
+    def test_escapes_non_latin_chars(self):
+        test_cases = [
+            ('Straßenbahn', 'straxdfenbahn'),  # 'ß' is escaped as 'xdf' rather than dropped
+            ('Спорт!', 'u0421u043fu043eu0440u0442'),  # letters escaped as 'uNNNN'; '!' is stripped
+            ('〔山脈〕', 'u5c71u8108'),  # the bracket punctuation is stripped, not escaped
+        ]
+
+        for (original, expected_result) in test_cases:
+            self.assertEqual(cautious_slugify(original), expected_result)
--- a/wagtail/wagtailcore/utils.py
+++ b/wagtail/wagtailcore/utils.py
@ -1,7 +1,10 @@
 import re
+import unicodedata

 from django.db.models import Model
 from django.apps import apps
+from django.utils.encoding import force_text
+from django.utils.text import slugify
 from django.utils.six import string_types


@ -45,3 +48,38 @@ def escape_script(text):
    `<-/script>`, `<--/script>` etc.
    """
    return SCRIPT_RE.sub(r'<-\1/script>', text)
+
+
+SLUGIFY_RE = re.compile(r'[^\w\s-]', re.UNICODE)
+
+
+def cautious_slugify(value):
+    """
+    Convert a string to ASCII exactly as Django's slugify does, with the exception
+    that any non-ASCII alphanumeric characters (that cannot be ASCIIfied under Unicode
+    normalisation) are escaped into codes like 'u0421' or 'xdf' instead of being deleted.
+
+    This ensures that the result of slugifying e.g. Cyrillic text will not be an empty
+    string, and can thus be safely used as an identifier (albeit not a human-readable one).
+    """
+    value = force_text(value)
+
+    # Normalize the string to (compatibility) decomposed unicode form, NFKD. This causes
+    # accented Latin characters to be split into 'base character' + 'accent modifier'; the
+    # latter will be stripped out by the regexp, resulting in an ASCII-clean character that
+    # doesn't need to be escaped
+    value = unicodedata.normalize('NFKD', value)
+
+    # Strip out characters that aren't letterlike, underscores, hyphens or whitespace,
+    # using the same regexp that slugify uses. This ensures that non-ASCII non-letters
+    # (e.g. accent modifiers, fancy punctuation) get stripped rather than escaped
+    value = SLUGIFY_RE.sub('', value)
+
+    # Encode as ASCII, escaping non-ASCII characters with backslashreplace, then convert
+    # back to a unicode string (which is what slugify expects)
+    value = value.encode('ascii', 'backslashreplace').decode('ascii')
+
+    # Pass to slugify to perform final conversion (whitespace stripping, applying
+    # mark_safe); this will also strip out the backslashes from the 'backslashreplace'
+    # conversion, leaving only the bare 'xNN' / 'uNNNN' codes
+    return slugify(value)