// Mirrored from https://github.com/twitter/the-algorithm
package com.twitter.tweetypie.matching

import com.twitter.common.text.language.LocaleUtil
import com.twitter.common_internal.text.pipeline.TwitterTextNormalizer
import com.twitter.common_internal.text.pipeline.TwitterTextTokenizer
import com.twitter.common_internal.text.version.PenguinVersion
import com.twitter.concurrent.Once
import com.twitter.io.StreamIO
import java.util.Locale
import scala.collection.JavaConverters._
/**
 * Produces a sequence of normalized tokens for an input string. The
 * normalization and tokenization steps are configured consistently so
 * that token sequences from different texts can be compared for
 * keyword matching.
 */
trait Tokenizer {

  /** Normalize and tokenize `input` into a comparable token sequence. */
  def tokenize(input: String): TokenSequence
}
object Tokenizer {

  /**
   * When a Penguin version is not explicitly specified, use this
   * version of Penguin to perform normalization and tokenization. If
   * you cache tokenized text, be sure to store the version as well, to
   * avoid comparing text that was normalized with different algorithms.
   */
  val DefaultPenguinVersion: PenguinVersion = PenguinVersion.PENGUIN_6

  /**
   * If you already know the locale of the text that is being tokenized,
   * use this method to get a tokenizer that is much more efficient than
   * the Tweet or Query tokenizer, since it does not have to perform
   * language detection.
   */
  def forLocale(locale: Locale): Tokenizer = get(locale, DefaultPenguinVersion)

  /**
   * Obtain a `Tokenizer` that will tokenize the text for the given
   * locale and version of the Penguin library.
   */
  def get(locale: Locale, version: PenguinVersion): Tokenizer =
    TokenizerFactories(version).forLocale(locale)

  /**
   * Encapsulates the configuration and use of [[TwitterTextTokenizer]]
   * and [[TwitterTextNormalizer]].
   */
  private[this] class TokenizerFactory(version: PenguinVersion) {
    // The normalizer is thread-safe, so share one instance.
    private[this] val normalizer =
      (new TwitterTextNormalizer.Builder(version)).build()

    // The TwitterTextTokenizer is relatively expensive to build,
    // and is not thread safe, so keep instances of it in a
    // ThreadLocal.
    private[this] val local =
      new ThreadLocal[TwitterTextTokenizer] {
        override def initialValue: TwitterTextTokenizer =
          (new TwitterTextTokenizer.Builder(version)).build()
      }

    /**
     * Obtain a [[Tokenizer]] for this combination of [[PenguinVersion]]
     * and [[Locale]].
     */
    def forLocale(locale: Locale): Tokenizer =
      new Tokenizer {
        override def tokenize(input: String): TokenSequence = {
          // Per-thread token stream; reset with the normalized text.
          val stream = local.get.getTwitterTokenStreamFor(locale)
          stream.reset(normalizer.normalize(input, locale))
          val builder = IndexedSeq.newBuilder[CharSequence]
          while (stream.incrementToken) builder += stream.term()
          TokenSequence(builder.result())
        }
      }
  }

  /**
   * Since there are a small number of Penguin versions, eagerly
   * initialize a TokenizerFactory for each version, to avoid managing
   * mutable state.
   */
  private[this] val TokenizerFactories: PenguinVersion => TokenizerFactory =
    PenguinVersion.values.map(v => v -> new TokenizerFactory(v)).toMap

  /**
   * The set of locales used in warmup. These locales are mentioned in
   * the logic of TwitterTextTokenizer and TwitterTextNormalizer.
   */
  private[this] val WarmUpLocales: Seq[Locale] =
    Seq
      .concat(
        Seq(
          Locale.JAPANESE,
          Locale.KOREAN,
          LocaleUtil.UNKNOWN,
          LocaleUtil.THAI,
          LocaleUtil.ARABIC,
          LocaleUtil.SWEDISH
        ),
        LocaleUtil.CHINESE_JAPANESE_LOCALES.asScala,
        LocaleUtil.CJK_LOCALES.asScala
      )
      // Deduplicate in one pass; replaces the redundant
      // .toSet.toArray.toSeq conversion chain and keeps a stable order.
      .distinct

  /**
   * Load the default inputs that are used for warming up this library.
   *
   * @throws IllegalArgumentException if the warmup resource is missing
   *         from the classpath (previously this surfaced as an opaque
   *         NullPointerException from `stream.close()`).
   */
  def warmUpCorpus(): Seq[String] = {
    val stream = getClass.getResourceAsStream("warmup-text.txt")
    // getResourceAsStream returns null when the resource is absent;
    // fail fast with a clear message instead of an NPE.
    require(stream != null, "missing classpath resource warmup-text.txt")
    val bytes =
      try StreamIO.buffer(stream)
      finally stream.close()
    bytes.toString("UTF-8").linesIterator.toArray.toSeq
  }

  /**
   * Exercise the functionality of this library on the specified
   * strings. In general, prefer [[warmUp]] to this method.
   */
  def warmUpWith(ver: PenguinVersion, texts: Iterable[String]): Unit =
    texts.foreach { txt =>
      // Exercise each locale
      WarmUpLocales.foreach { loc =>
        Tokenizer.get(loc, ver).tokenize(txt)
        UserMutes.builder().withPenguinVersion(ver).withLocale(loc).validate(txt)
      }

      // Exercise language detection
      TweetTokenizer.get(ver).tokenize(txt)
      UserMutes.builder().withPenguinVersion(ver).validate(txt)
    }

  // Once guarantees the warmup body runs at most one time; later calls
  // block until the first invocation completes, then return.
  private[this] val warmUpOnce = Once(warmUpWith(DefaultPenguinVersion, warmUpCorpus()))

  /**
   * The creation of the first TwitterTextTokenizer is relatively
   * expensive, and tokenizing some texts may cause significant
   * initialization.
   *
   * This method exercises the functionality of this library
   * with a range of texts in order to perform as much initialization as
   * possible before the library is used in a latency-sensitive way.
   *
   * The warmup routine will only run once. Subsequent invocations of
   * `warmUp` will do no additional work, and will return once warmup is
   * complete.
   *
   * The warmup will take on the order of seconds.
   */
  def warmUp(): Unit = warmUpOnce()
}