From 00f707a4389403ac9f0a0769d3afd8c71f4966c6 Mon Sep 17 00:00:00 2001
From: Matt Westcott <matt@west.co.tt>
Date: Tue, 14 Jul 2015 16:14:45 +0100
Subject: [PATCH] Ensure that tabs in non-Latin languages are given non-blank
 IDs - fixes #1428

---
 .../edit_handlers/tabbed_interface.html       |  5 ++-
 .../templatetags/wagtailadmin_tags.py         |  8 ++++
 wagtail/wagtailcore/tests/test_utils.py       | 38 +++++++++++++++++++
 wagtail/wagtailcore/utils.py                  | 38 +++++++++++++++++++
 4 files changed, 87 insertions(+), 2 deletions(-)
 create mode 100644 wagtail/wagtailcore/tests/test_utils.py
diff --git a/wagtail/wagtailadmin/templates/wagtailadmin/edit_handlers/tabbed_interface.html b/wagtail/wagtailadmin/templates/wagtailadmin/edit_handlers/tabbed_interface.html
index b70c1d47af..761f04c0ee 100644
--- a/wagtail/wagtailadmin/templates/wagtailadmin/edit_handlers/tabbed_interface.html
+++ b/wagtail/wagtailadmin/templates/wagtailadmin/edit_handlers/tabbed_interface.html
@@ -1,12 +1,13 @@
+{% load wagtailadmin_tags %}
 <ul class="tab-nav merged">
     {% for child in self.children %}
-        <li class="{{ child.classes|join:" " }} {% if forloop.first %}active{% endif %}"><a href="#{{ child.heading|slugify }}" class="{% if forloop.first %}active{% endif %}">{{ child.heading }}</a></li>
+        <li class="{{ child.classes|join:" " }} {% if forloop.first %}active{% endif %}"><a href="#{{ child.heading|cautious_slugify }}" class="{% if forloop.first %}active{% endif %}">{{ child.heading }}</a></li>
     {% endfor %}
 </ul>
 
 <div class="tab-content">
     {% for child in self.children %}
-        <section id="{{ child.heading|slugify }}" class="{{ child.classes|join:" " }} {% if forloop.first %}active{% endif %}">
+        <section id="{{ child.heading|cautious_slugify }}" class="{{ child.classes|join:" " }} {% if forloop.first %}active{% endif %}">
             {{ child.render_as_object }}
         </section>
     {% endfor %}
diff --git a/wagtail/wagtailadmin/templatetags/wagtailadmin_tags.py b/wagtail/wagtailadmin/templatetags/wagtailadmin_tags.py
index 3a4a21d41b..f66d77bc19 100644
--- a/wagtail/wagtailadmin/templatetags/wagtailadmin_tags.py
+++ b/wagtail/wagtailadmin/templatetags/wagtailadmin_tags.py
@@ -3,10 +3,12 @@ from __future__ import unicode_literals
 from django.conf import settings
 from django import template
 from django.contrib.humanize.templatetags.humanize import intcomma
+from django.template.defaultfilters import stringfilter
 
 from wagtail.wagtailcore import hooks
 from wagtail.wagtailcore.models import get_navigation_menu_items, UserPagePermissionsProxy, PageViewRestriction
 from wagtail.wagtailcore.utils import camelcase_to_underscore, escape_script
+from wagtail.wagtailcore.utils import cautious_slugify as _cautious_slugify
 from wagtail.wagtailadmin.menu import admin_menu
 
 
@@ -183,3 +185,9 @@ def has_unrendered_errors(bound_field):
     the widget does not support the render_with_errors method
     """
     return bound_field.errors and not hasattr(bound_field.field.widget, 'render_with_errors')
+
+
+@register.filter(is_safe=True)  # exposed to templates as the `cautious_slugify` filter
+@stringfilter  # Django decorator: the value is converted to a string before the call
+def cautious_slugify(value):
+    return _cautious_slugify(value)  # wagtail.wagtailcore.utils.cautious_slugify
diff --git a/wagtail/wagtailcore/tests/test_utils.py b/wagtail/wagtailcore/tests/test_utils.py
new file mode 100644
index 0000000000..581f3271a9
--- /dev/null
+++ b/wagtail/wagtailcore/tests/test_utils.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.test import TestCase
+
+from django.utils.text import slugify
+from wagtail.wagtailcore.utils import cautious_slugify
+
+
+class TestCautiousSlugify(TestCase):
+
+    def test_behaves_same_as_slugify_for_latin_chars(self):
+        latin_cases = (
+            ('', ''),
+            ('???', ''),
+            ('Hello world', 'hello-world'),
+            ('Hello_world', 'hello_world'),
+            ('Hellö wörld', 'hello-world'),
+            ('Hello   world', 'hello-world'),
+            ('   Hello world   ', 'hello-world'),
+            ('Hello, world!', 'hello-world'),
+            ('Hello*world', 'helloworld'),
+            ('Hello☃world', 'helloworld'),
+        )
+        # Django's slugify and cautious_slugify must agree on each of these inputs
+        for source_text, expected_slug in latin_cases:
+            self.assertEqual(slugify(source_text), expected_slug)
+            self.assertEqual(cautious_slugify(source_text), expected_slug)
+
+    def test_escapes_non_latin_chars(self):
+        escaped_cases = (
+            ('Straßenbahn', 'straxdfenbahn'),
+            ('Спорт!', 'u0421u043fu043eu0440u0442'),
+            ('〔山脈〕', 'u5c71u8108'),
+        )
+        # letters with no ASCII form must survive as escape codes, not be deleted
+        for source_text, expected_slug in escaped_cases:
+            self.assertEqual(cautious_slugify(source_text), expected_slug)
diff --git a/wagtail/wagtailcore/utils.py b/wagtail/wagtailcore/utils.py
index e8ad697817..5b848705f5 100644
--- a/wagtail/wagtailcore/utils.py
+++ b/wagtail/wagtailcore/utils.py
@@ -1,7 +1,10 @@
 import re
+import unicodedata
 
 from django.db.models import Model
 from django.apps import apps
+from django.utils.encoding import force_text
+from django.utils.text import slugify
 from django.utils.six import string_types
 
 
@@ -45,3 +48,38 @@ def escape_script(text):
     `<-/script>`, `<--/script>` etc.
     """
     return SCRIPT_RE.sub(r'<-\1/script>', text)
+
+
+SLUGIFY_RE = re.compile(r'[^\w\s-]', re.UNICODE)  # same regexp that slugify uses
+
+
+def cautious_slugify(value):
+    """
+    Slugify ``value`` in the same way as Django's slugify, with one difference: letterlike
+    characters that have no ASCII equivalent under Unicode normalisation are kept as
+    escape codes such as 'u0421' rather than being thrown away.
+
+    As a result, slugifying non-Latin text (e.g. Cyrillic) does not yield an empty string,
+    so the output is usable as an identifier, albeit not a human-readable one.
+    """
+    text = force_text(value)
+
+    # Decompose (NFKD) so that an accented Latin letter becomes a plain base letter
+    # followed by a separate accent modifier
+    decomposed = unicodedata.normalize('NFKD', text)
+
+    # Remove everything that is not letterlike, an underscore, whitespace or a hyphen,
+    # with the same regexp slugify uses. Accent modifiers and fancy punctuation are
+    # dropped here, so they are never escaped; the base letters left behind are
+    # already ASCII-clean
+    letterlike = SLUGIFY_RE.sub('', decomposed)
+
+    # 'backslashreplace' rewrites each remaining non-ASCII character as a backslash
+    # escape; decoding the bytes again gives the unicode string that slugify expects
+    ascii_bytes = letterlike.encode('ascii', 'backslashreplace')
+    escaped = ascii_bytes.decode('ascii')
+
+    # slugify performs the final conversion (whitespace stripping, applying
+    # mark_safe) and also strips the backslashes that 'backslashreplace' inserted
+    slug = slugify(escaped)
+    return slug