Signal-Android/app/src/main/java/org/thoughtcrime/securesms/util/StringUtil.java

package org.thoughtcrime.securesms.util;

import android.text.TextUtils;

import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
import androidx.core.text.BidiFormatter;

import org.signal.core.util.BreakIteratorCompat;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Set;

public final class StringUtil {

  /**
   * Extra characters that {@link #isVisuallyEmpty(char)} treats as empty in addition to whatever
   * {@link Character#isWhitespace(char)} already reports as whitespace.
   */
  private static final Set<Character> WHITESPACE = SetUtil.newHashSet('\u200E',  // left-to-right mark
                                                                      '\u200F',  // right-to-left mark
                                                                      '\u2007'); // figure space

  /**
   * Code points of the Unicode bidirectional formatting characters that
   * {@link StringUtil#isolateBidi(String)} counts and emits.
   */
  private static final class Bidi {
    /** Embedding (LRE, RLE) and override (LRO, RLO) initiators; each one is closed by {@link #PDF}. */
    private static final Set<Integer> OVERRIDES = SetUtil.newHashSet("\u202a".codePointAt(0), /* LRE */
                                                                     "\u202b".codePointAt(0), /* RLE */
                                                                     "\u202d".codePointAt(0), /* LRO */
                                                                     "\u202e".codePointAt(0)  /* RLO */);

    /** Isolate initiators (LRI, RLI, FSI): set direction and isolate surrounding text; closed by {@link #PDI}. */
    private static final Set<Integer> ISOLATES = SetUtil.newHashSet("\u2066".codePointAt(0), /* LRI */
                                                                    "\u2067".codePointAt(0), /* RLI */
                                                                    "\u2068".codePointAt(0)  /* FSI */);
    /** Pop directional formatting. Closes things in {@link #OVERRIDES} */
    private static final int PDF = "\u202c".codePointAt(0);

    /** Pop directional isolate. Closes things in {@link #ISOLATES} */
    private static final int PDI = "\u2069".codePointAt(0);

    /** First strong isolate (auto-detecting isolate). Also a member of {@link #ISOLATES}. */
    private static final int FSI = "\u2068".codePointAt(0);
  }

  /** Not instantiable: this class only exposes static helpers. */
  private StringUtil() {
  }

  /**
   * Trims a name string so that its UTF-8 encoding fits into the byte length requirement.
   * <p>
   * The string is cut on the boundaries yielded by {@code CharacterIterable}, so that a surrogate
   * pair or a grapheme cluster is treated as a single character: it is kept whole or dropped.
   * See examples in tests defined in StringUtilText_trimToFit.
   *
   * @param name          The string to trim. May be null.
   * @param maxByteLength The maximum number of UTF-8 bytes the result may occupy.
   * @return An empty string if {@code name} is null or empty, {@code name} itself if it already
   *         fits, otherwise the leading characters of {@code name} up to (not including) the first
   *         one that would exceed the limit.
   */
  public static @NonNull String trimToFit(@Nullable String name, int maxByteLength) {
    if (TextUtils.isEmpty(name)) {
      return "";
    }

    if (name.getBytes(StandardCharsets.UTF_8).length <= maxByteLength) {
      return name;
    }

    try (ByteArrayOutputStream stream = new ByteArrayOutputStream()) {
      for (String graphemeCharacter : new CharacterIterable(name)) {
        byte[] bytes = graphemeCharacter.getBytes(StandardCharsets.UTF_8);

        if (stream.size() + bytes.length <= maxByteLength) {
          stream.write(bytes);
        } else {
          // Stop at the first character that does not fit; later, smaller ones are not considered.
          break;
        }
      }

      // Decode with the charset the bytes were encoded in. The no-arg toString() would use the
      // platform default charset instead.
      return new String(stream.toByteArray(), StandardCharsets.UTF_8);
    } catch (IOException e) {
      // Writing to and closing an in-memory stream is not expected to fail.
      throw new AssertionError(e);
    }
  }

  /**
   * Strips leading and trailing whitespace, as defined by {@link Character#isWhitespace(char)}.
   *
   * @param charSequence The sequence to trim.
   * @return The same instance when there is nothing to strip, otherwise a sub-sequence of it with
   *         no leading or trailing whitespace.
   */
  public static @NonNull CharSequence trim(@NonNull CharSequence charSequence) {
    final int length = charSequence.length();

    if (length == 0) {
      return charSequence;
    }

    // Index of the first character to keep.
    int first = 0;
    while (first < length && Character.isWhitespace(charSequence.charAt(first))) {
      first++;
    }

    // Exclusive upper bound of the characters to keep; never walks back past 'first'.
    int limit = length;
    while (limit - 1 > first && Character.isWhitespace(charSequence.charAt(limit - 1))) {
      limit--;
    }

    boolean untouched = first == 0 && limit == length;

    return untouched ? charSequence : charSequence.subSequence(first, limit);
  }

  /**
   * Checks whether a string contains anything other than visually empty characters.
   *
   * @param value The string to inspect. May be null.
   * @return True when {@code value} is null, has no characters, or consists solely of characters
   *         that {@link #isVisuallyEmpty(char)} considers empty.
   */
  public static boolean isVisuallyEmpty(@Nullable String value) {
    return value == null || value.isEmpty() || indexOfFirstNonEmptyChar(value) == -1;
  }

  /**
   * Strips leading and trailing characters that {@link #isVisuallyEmpty(char)} considers empty.
   *
   * @param value The string to trim.
   * @return The part of {@code value} between its first and last non-empty characters, inclusive,
   *         or an empty string when every character is visually empty.
   */
  public static String trimToVisualBounds(@NonNull String value) {
    final int firstVisible = indexOfFirstNonEmptyChar(value);

    return firstVisible == -1 ? ""
                              : value.substring(firstVisible, indexOfLastNonEmptyChar(value) + 1);
  }

  /**
   * @return The index of the first character that is not visually empty, or -1 if there is none.
   */
  private static int indexOfFirstNonEmptyChar(@NonNull String value) {
    final int length = value.length();
    int       index  = 0;

    while (index < length && isVisuallyEmpty(value.charAt(index))) {
      index++;
    }

    return index < length ? index : -1;
  }

  /**
   * @return The index of the last character that is not visually empty, or -1 if there is none.
   */
  private static int indexOfLastNonEmptyChar(@NonNull String value) {
    int index = value.length() - 1;

    // Walk backwards; falling off the front leaves index at -1, which is the "not found" result.
    while (index >= 0 && isVisuallyEmpty(value.charAt(index))) {
      index--;
    }

    return index;
  }

  /**
   * Checks a single character for visual emptiness.
   *
   * @return True if {@link Character#isWhitespace(char)} accepts the character, or if it is one of
   *         the additional characters listed in {@link #WHITESPACE}.
   */
  public static boolean isVisuallyEmpty(char c) {
    if (Character.isWhitespace(c)) {
      return true;
    }

    return WHITESPACE.contains(c);
  }

  /**
   * Converts a unicode code point into a string.
   *
   * @param codePoint The code point to convert.
   * @return A string holding the one or two {@code char}s that encode {@code codePoint}.
   */
  public static @NonNull String codePointToString(int codePoint) {
    char[] utf16Units = Character.toChars(codePoint);

    return String.valueOf(utf16Units);
  }

  /**
   * Isolates bi-directional text from influencing surrounding text. You should use this whenever
   * you're injecting user-generated text into a larger string.
   *
   * You'd think we'd be able to trust {@link BidiFormatter}, but unfortunately it just misses some
   * corner cases, so here we are.
   *
   * The general idea is just to balance out the opening and closing codepoints, and then wrap the
   * whole thing in FSI/PDI to isolate it. Balancing is count-based: one PDF is appended for every
   * embedding/override initiator that has no PDF, and one PDI for every isolate initiator that has
   * no PDI.
   *
   * For more details, see:
   * https://www.w3.org/International/questions/qa-bidi-unicode-controls
   *
   * @param text The text to isolate. May be null.
   * @return An empty string if {@code text} is null, {@code text} itself if {@code Util.isEmpty}
   *         reports it as empty, otherwise the balanced text wrapped in FSI and PDI.
   */
  public static @NonNull String isolateBidi(@Nullable String text) {
    if (text == null) {
      return "";
    }

    if (Util.isEmpty(text)) {
      return text;
    }

    int overrideCount      = 0;
    int overrideCloseCount = 0;
    int isolateCount       = 0;
    int isolateCloseCount  = 0;

    for (int i = 0, len = text.length(); i < len; ) {
      int codePoint = text.codePointAt(i);

      if (Bidi.OVERRIDES.contains(codePoint)) {
        overrideCount++;
      } else if (codePoint == Bidi.PDF) {
        overrideCloseCount++;
      } else if (Bidi.ISOLATES.contains(codePoint)) {
        isolateCount++;
      } else if (codePoint == Bidi.PDI) {
        isolateCloseCount++;
      }

      // codePointAt() takes a UTF-16 index, so step by the width of the code point just read.
      // Bounding the index by the code point count would stop short of the end whenever the text
      // contains supplementary characters (e.g. emoji).
      i += Character.charCount(codePoint);
    }

    StringBuilder suffix = new StringBuilder();

    while (overrideCount > overrideCloseCount) {
      suffix.appendCodePoint(Bidi.PDF);
      overrideCloseCount++;
    }

    while (isolateCount > isolateCloseCount) {
      // An open isolate is terminated by PDI; appending an initiator would open yet another one.
      suffix.appendCodePoint(Bidi.PDI);
      isolateCloseCount++;
    }

    StringBuilder out = new StringBuilder();

    return out.appendCodePoint(Bidi.FSI)
              .append(text)
              .append(suffix)
              .appendCodePoint(Bidi.PDI)
              .toString();
  }

  /**
   * Removes every FSI (U+2068), PDI (U+2069) and PDF (U+202C) character from the text, including
   * the FSI/PDI pair that {@link #isolateBidi(String)} wraps around its input.
   *
   * @param text The text to strip. May be null.
   * @return {@code null} if {@code text} is null, otherwise the text without those characters.
   */
  public static @Nullable String stripBidiProtection(@Nullable String text) {
    return text == null ? null
                        : text.replaceAll("[\\u2068\\u2069\\u202c]", "");
  }

  /**
   * Trims a {@link CharSequence} of starting and trailing whitespace, where whitespace means any
   * character less than or equal to the space character. Behavior matches {@link String#trim()}
   * to preserve expectations around results.
   *
   * @param text The sequence to trim.
   * @return The same instance when nothing needs to be removed, otherwise the trimmed sub-sequence.
   */
  public static CharSequence trimSequence(CharSequence text) {
    final int fullLength = text.length();

    int begin = 0;
    int end   = fullLength; // exclusive

    while (begin < end && text.charAt(begin) <= ' ') {
      begin++;
    }

    while (begin < end && text.charAt(end - 1) <= ' ') {
      end--;
    }

    if (begin == 0 && end == fullLength) {
      return text;
    }

    return text.subSequence(begin, end);
  }

  /**
   * If the {@code text} exceeds {@code maxChars} it is trimmed in the middle so that the result is
   * exactly {@code maxChars} long including an added ellipsis character.
   * <p>
   * Otherwise the text is returned untouched.
   * <p>
   * When {@code maxChars} is even, one more character is kept from the end of the text than the
   * start. Lengths are measured in UTF-16 {@code char}s, so a surrogate pair that straddles a cut
   * point is not kept together.
   *
   * @param text     The text to abbreviate. May be null.
   * @param maxChars The maximum length of the result.
   * @return {@code null} if {@code text} is null, {@code text} itself if it already fits, an empty
   *         string if {@code maxChars} leaves no room for even the ellipsis, otherwise the
   *         abbreviated text.
   */
  public static @Nullable CharSequence abbreviateInMiddle(@Nullable CharSequence text, int maxChars) {
    if (text == null || text.length() <= maxChars) {
      return text;
    }

    // With no room for the ellipsis the index arithmetic below would go negative and throw.
    if (maxChars <= 0) {
      return "";
    }

    int start = (maxChars - 1) / 2;
    int end   = (maxChars - 1) - start;
    return text.subSequence(0, start) + "…" + text.subSequence(text.length() - end, text.length());
  }

  /**
   * Counts the graphemes in the provided text.
   *
   * @param text The text to measure.
   * @return The number of graphemes, as reported by {@code BreakIteratorCompat#countBreaks()}.
   */
  public static int getGraphemeCount(@NonNull CharSequence text) {
    final BreakIteratorCompat graphemeBreaks = BreakIteratorCompat.getInstance();

    graphemeBreaks.setText(text);

    return graphemeBreaks.countBreaks();
  }
}
Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00			`package org.thoughtcrime.securesms.util;`

Account for grapheme cluster when trimming to fit a specific length. Fixes #10076 2020-10-25 22:13:29 +00:00			`import android.text.TextUtils;`

Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00			`import androidx.annotation.NonNull;`
			`import androidx.annotation.Nullable;`
Improve handling of partially bi-directional text. 2020-07-29 04:55:20 +00:00			`import androidx.core.text.BidiFormatter;`
Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00
Only allow emojis as reactions. 2021-04-09 20:44:47 +00:00			`import org.signal.core.util.BreakIteratorCompat;`

Account for grapheme cluster when trimming to fit a specific length. Fixes #10076 2020-10-25 22:13:29 +00:00			`import java.io.ByteArrayOutputStream;`
			`import java.io.IOException;`
Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00			`import java.nio.charset.StandardCharsets;`
Disallow 'visually empty' profile names. 2020-07-01 19:30:39 +00:00			`import java.util.Set;`
Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00
			`public final class StringUtil {`

Upgrade SDK to 30. 2020-11-17 13:58:14 +00:00			`private static final Set<Character> WHITESPACE = SetUtil.newHashSet('\u200E', // left-to-right mark`
			`'\u200F', // right-to-left mark`
			`'\u2007'); // figure space`
Disallow 'visually empty' profile names. 2020-07-01 19:30:39 +00:00
Improve handling of partially bi-directional text. 2020-07-29 04:55:20 +00:00			`private static final class Bidi {`
			`/** Override text direction */`
Upgrade SDK to 30. 2020-11-17 13:58:14 +00:00			`private static final Set<Integer> OVERRIDES = SetUtil.newHashSet("\u202a".codePointAt(0), /* LRE */`
			`"\u202b".codePointAt(0), /* RLE */`
			`"\u202d".codePointAt(0), /* LRO */`
			`"\u202e".codePointAt(0) /* RLO */);`
Improve handling of partially bi-directional text. 2020-07-29 04:55:20 +00:00
			`/** Set direction and isolate surrounding text */`
Upgrade SDK to 30. 2020-11-17 13:58:14 +00:00			`private static final Set<Integer> ISOLATES = SetUtil.newHashSet("\u2066".codePointAt(0), /* LRI */`
			`"\u2067".codePointAt(0), /* RLI */`
			`"\u2068".codePointAt(0) /* FSI */);`
Improve handling of partially bi-directional text. 2020-07-29 04:55:20 +00:00			`/** Closes things in {@link #OVERRIDES} */`
			`private static final int PDF = "\u202c".codePointAt(0);`

			`/** Closes things in {@link #ISOLATES} */`
			`private static final int PDI = "\u2069".codePointAt(0);`

			`/** Auto-detecting isolate */`
			`private static final int FSI = "\u2068".codePointAt(0);`
			`}`

Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00			`private StringUtil() {`
			`}`

			`/**`
			`* Trims a name string to fit into the byte length requirement.`
Account for grapheme cluster when trimming to fit a specific length. Fixes #10076 2020-10-25 22:13:29 +00:00			`* <p>`
			`* This method treats a surrogate pair and a grapheme cluster a single character`
			`* See examples in tests defined in StringUtilText_trimToFit.`
Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00			`*/`
Account for grapheme cluster when trimming to fit a specific length. Fixes #10076 2020-10-25 22:13:29 +00:00			`public static @NonNull String trimToFit(@Nullable String name, int maxByteLength) {`
			`if (TextUtils.isEmpty(name)) {`
			`return "";`
Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00			`}`

Account for grapheme cluster when trimming to fit a specific length. Fixes #10076 2020-10-25 22:13:29 +00:00			`if (name.getBytes(StandardCharsets.UTF_8).length <= maxByteLength) {`
			`return name;`
Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00			`}`

Account for grapheme cluster when trimming to fit a specific length. Fixes #10076 2020-10-25 22:13:29 +00:00			`try (ByteArrayOutputStream stream = new ByteArrayOutputStream()) {`
			`for (String graphemeCharacter : new CharacterIterable(name)) {`
			`byte[] bytes = graphemeCharacter.getBytes(StandardCharsets.UTF_8);`

			`if (stream.size() + bytes.length <= maxByteLength) {`
			`stream.write(bytes);`
			`} else {`
			`break;`
			`}`
			`}`
			`return stream.toString();`
			`} catch (IOException e) {`
			`throw new AssertionError(e);`
			`}`
Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00			`}`
Disallow 'visually empty' profile names. 2020-07-01 19:30:39 +00:00
Trim message bodies at display time. 2020-11-12 17:18:20 +00:00			`/**`
			`* @return A charsequence with no leading or trailing whitespace. Only creates a new charsequence`
			`* if it has to.`
			`*/`
			`public static @NonNull CharSequence trim(@NonNull CharSequence charSequence) {`
			`if (charSequence.length() == 0) {`
			`return charSequence;`
			`}`

			`int start = 0;`
			`int end = charSequence.length() - 1;`

			`while (start < charSequence.length() && Character.isWhitespace(charSequence.charAt(start))) {`
			`start++;`
			`}`

			`while (end >= 0 && end > start && Character.isWhitespace(charSequence.charAt(end))) {`
			`end--;`
			`}`

			`if (start > 0 \|\| end < charSequence.length() - 1) {`
			`return charSequence.subSequence(start, end + 1);`
			`} else {`
			`return charSequence;`
			`}`
			`}`

Disallow 'visually empty' profile names. 2020-07-01 19:30:39 +00:00			`/**`
			`* @return True if the string is empty, or if it contains nothing but whitespace characters.`
			`* Accounts for various unicode whitespace characters.`
			`*/`
			`public static boolean isVisuallyEmpty(@Nullable String value) {`
			`if (value == null \|\| value.length() == 0) {`
			`return true;`
			`}`

Prevent leading and trailing whitespace in group names. 2020-07-23 15:25:37 +00:00			`return indexOfFirstNonEmptyChar(value) == -1;`
			`}`

			`/**`
			`* @return String without any leading or trailing whitespace.`
			`* Accounts for various unicode whitespace characters.`
			`*/`
			`public static String trimToVisualBounds(@NonNull String value) {`
			`int start = indexOfFirstNonEmptyChar(value);`

			`if (start == -1) {`
			`return "";`
			`}`

			`int end = indexOfLastNonEmptyChar(value);`

			`return value.substring(start, end + 1);`
			`}`

			`private static int indexOfFirstNonEmptyChar(@NonNull String value) {`
			`int length = value.length();`

			`for (int i = 0; i < length; i++) {`
Disallow 'visually empty' profile names. 2020-07-01 19:30:39 +00:00			`if (!isVisuallyEmpty(value.charAt(i))) {`
Prevent leading and trailing whitespace in group names. 2020-07-23 15:25:37 +00:00			`return i;`
Disallow 'visually empty' profile names. 2020-07-01 19:30:39 +00:00			`}`
			`}`

Prevent leading and trailing whitespace in group names. 2020-07-23 15:25:37 +00:00			`return -1;`
			`}`

			`private static int indexOfLastNonEmptyChar(@NonNull String value) {`
			`for (int i = value.length() - 1; i >= 0; i--) {`
			`if (!isVisuallyEmpty(value.charAt(i))) {`
			`return i;`
			`}`
			`}`
			`return -1;`
Disallow 'visually empty' profile names. 2020-07-01 19:30:39 +00:00			`}`

			`/**`
			`* @return True if the character is invisible or whitespace. Accounts for various unicode`
			`* whitespace characters.`
			`*/`
			`public static boolean isVisuallyEmpty(char c) {`
			`return Character.isWhitespace(c) \|\| WHITESPACE.contains(c);`
			`}`
Fix casing issues with non-ASCII characters in contact search. SQLite's case-related stuff is ASCII-only. That means that even though LIKE is supposed to be case-insensitive, it fails when used on non-ASCII characters. There appears to be no relief in SQLite itself, so I swapped our contact search to use GLOB instead of LIKE and wrote a little thing to convert query strings into a case-insensitive unicode-compatible patterns. Didn't see any noticeable performance difference. 2020-07-23 02:36:10 +00:00
			`/**`
			`* @return A string representation of the provided unicode code point.`
			`*/`
			`public static @NonNull String codePointToString(int codePoint) {`
			`return new String(Character.toChars(codePoint));`
			`}`
Improve handling of partially bi-directional text. 2020-07-29 04:55:20 +00:00
			`/**`
			`* Isolates bi-directional text from influencing surrounding text. You should use this whenever`
			`* you're injecting user-generated text into a larger string.`
			`*`
			`* You'd think we'd be able to trust {@link BidiFormatter}, but unfortunately it just misses some`
			`* corner cases, so here we are.`
			`*`
			`* The general idea is just to balance out the opening and closing codepoints, and then wrap the`
			`* whole thing in FSI/PDI to isolate it.`
			`*`
			`* For more details, see:`
			`* https://www.w3.org/International/questions/qa-bidi-unicode-controls`
			`*/`
Don't double-isolate-bidi on phone numbers. Fixes #10257 2020-12-17 16:53:58 +00:00			`public static @NonNull String isolateBidi(@Nullable String text) {`
			`if (text == null) {`
			`return "";`
			`}`

			`if (Util.isEmpty(text)) {`
Fix issue where group updates were mis-rendered. 2020-07-31 03:05:09 +00:00			`return text;`
			`}`

Improve handling of partially bi-directional text. 2020-07-29 04:55:20 +00:00			`int overrideCount = 0;`
			`int overrideCloseCount = 0;`
			`int isolateCount = 0;`
			`int isolateCloseCount = 0;`

			`for (int i = 0, len = text.codePointCount(0, text.length()); i < len; i++) {`
			`int codePoint = text.codePointAt(i);`

			`if (Bidi.OVERRIDES.contains(codePoint)) {`
			`overrideCount++;`
			`} else if (codePoint == Bidi.PDF) {`
			`overrideCloseCount++;`
			`} else if (Bidi.ISOLATES.contains(codePoint)) {`
			`isolateCount++;`
			`} else if (codePoint == Bidi.PDI) {`
			`isolateCloseCount++;`
			`}`
			`}`

			`StringBuilder suffix = new StringBuilder();`

			`while (overrideCount > overrideCloseCount) {`
			`suffix.appendCodePoint(Bidi.PDF);`
			`overrideCloseCount++;`
			`}`

			`while (isolateCount > isolateCloseCount) {`
			`suffix.appendCodePoint(Bidi.FSI);`
			`isolateCloseCount++;`
			`}`

			`StringBuilder out = new StringBuilder();`

			`return out.appendCodePoint(Bidi.FSI)`
			`.append(text)`
			`.append(suffix)`
			`.appendCodePoint(Bidi.PDI)`
			`.toString();`
			`}`
Add mentions for v2 group chats. 2020-08-05 20:45:52 +00:00
Fix false group name and avatar updates. 2020-09-25 17:46:38 +00:00			`public static @Nullable String stripBidiProtection(@Nullable String text) {`
			`if (text == null) return null;`

			`return text.replaceAll("[\\u2068\\u2069\\u202c]", "");`
			`}`

Add mentions for v2 group chats. 2020-08-05 20:45:52 +00:00			`/**`
			`* Trims a {@link CharSequence} of starting and trailing whitespace. Behavior matches`
			`* {@link String#trim()} to preserve expectations around results.`
			`*/`
			`public static CharSequence trimSequence(CharSequence text) {`
			`int length = text.length();`
			`int startIndex = 0;`

			`while ((startIndex < length) && (text.charAt(startIndex) <= ' ')) {`
			`startIndex++;`
			`}`
			`while ((startIndex < length) && (text.charAt(length - 1) <= ' ')) {`
			`length--;`
			`}`
			`return (startIndex > 0 \|\| length < text.length()) ? text.subSequence(startIndex, length) : text;`
			`}`
Payments. Co-authored-by: Alan Evans <alan@signal.org> Co-authored-by: Alex Hart <alex@signal.org> Co-authored-by: Cody Henthorne <cody@signal.org> 2021-04-06 16:03:33 +00:00
			`/**`
			`* If the {@param text} exceeds the {@param maxChars} it is trimmed in the middle so that the result is exactly {@param maxChars} long including an added`
			`* ellipsis character.`
			`* <p>`
			`* Otherwise the string is returned untouched.`
			`* <p>`
			`* When {@param maxChars} is even, one more character is kept from the end of the string than the start.`
			`*/`
			`public static @Nullable CharSequence abbreviateInMiddle(@Nullable CharSequence text, int maxChars) {`
			`if (text == null \|\| text.length() <= maxChars) {`
			`return text;`
			`}`

			`int start = (maxChars - 1) / 2;`
			`int end = (maxChars - 1) - start;`
			`return text.subSequence(0, start) + "…" + text.subSequence(text.length() - end, text.length());`
			`}`
Only allow emojis as reactions. 2021-04-09 20:44:47 +00:00
			`/**`
			`* @return The number of graphemes in the provided string.`
			`*/`
			`public static int getGraphemeCount(@NonNull CharSequence text) {`
			`BreakIteratorCompat iterator = BreakIteratorCompat.getInstance();`
			`iterator.setText(text);`
			`return iterator.countBreaks();`
			`}`
Add some polish to the groups V2 manager UI. 2020-05-07 13:39:40 +00:00			`}`