// Mirrored from https://github.com/twitter/the-algorithm
package com.twitter.tweetypie.matching

import com.twitter.common.text.language.LocaleUtil
import com.twitter.common_internal.text.pipeline.TwitterTextNormalizer
import com.twitter.common_internal.text.pipeline.TwitterTextTokenizer
import com.twitter.common_internal.text.version.PenguinVersion
import com.twitter.concurrent.Once
import com.twitter.io.StreamIO
import java.util.Locale
import scala.collection.JavaConverters._
/**
 * Produces a sequence of normalized tokens for an input string. The
 * normalization and tokenization steps are configured consistently so
 * that token sequences from different texts can be compared for
 * keyword matching.
 */
trait Tokenizer {

  /** Normalize and tokenize `input` into a comparable token sequence. */
  def tokenize(input: String): TokenSequence
}
object Tokenizer {

  /**
   * When a Penguin version is not explicitly specified, use this
   * version of Penguin to perform normalization and tokenization. If
   * you cache tokenized text, be sure to store the version as well, to
   * avoid comparing text that was normalized with different algorithms.
   */
  val DefaultPenguinVersion: PenguinVersion = PenguinVersion.PENGUIN_6

  /**
   * If you already know the locale of the text that is being tokenized,
   * use this method to get a tokenizer that is much more efficient than
   * the Tweet or Query tokenizer, since it does not have to perform
   * language detection.
   */
  def forLocale(locale: Locale): Tokenizer = get(locale, DefaultPenguinVersion)

  /**
   * Obtain a `Tokenizer` that will tokenize the text for the given
   * locale and version of the Penguin library.
   */
  def get(locale: Locale, version: PenguinVersion): Tokenizer =
    TokenizerFactories(version).forLocale(locale)

  /**
   * Encapsulates the configuration and use of [[TwitterTextTokenizer]]
   * and [[TwitterTextNormalizer]].
   */
  private[this] class TokenizerFactory(version: PenguinVersion) {
    // The normalizer is thread-safe, so share one instance.
    private[this] val normalizer =
      (new TwitterTextNormalizer.Builder(version)).build()

    // The TwitterTextTokenizer is relatively expensive to build,
    // and is not thread safe, so keep instances of it in a
    // ThreadLocal.
    private[this] val local =
      new ThreadLocal[TwitterTextTokenizer] {
        override def initialValue: TwitterTextTokenizer =
          (new TwitterTextTokenizer.Builder(version)).build()
      }

    /**
     * Obtain a [[Tokenizer]] for this combination of [[PenguinVersion]]
     * and [[Locale]].
     */
    def forLocale(locale: Locale): Tokenizer =
      new Tokenizer {
        override def tokenize(input: String): TokenSequence = {
          // Per-thread token stream; reset with the normalized text.
          val stream = local.get.getTwitterTokenStreamFor(locale)
          stream.reset(normalizer.normalize(input, locale))
          val builder = IndexedSeq.newBuilder[CharSequence]
          while (stream.incrementToken) builder += stream.term()
          TokenSequence(builder.result())
        }
      }
  }

  /**
   * Since there are a small number of Penguin versions, eagerly
   * initialize a TokenizerFactory for each version, to avoid managing
   * mutable state.
   */
  private[this] val TokenizerFactories: PenguinVersion => TokenizerFactory =
    PenguinVersion.values.map(v => v -> new TokenizerFactory(v)).toMap

  /**
   * The set of locales used in warmup. These locales are mentioned in
   * the logic of TwitterTextTokenizer and TwitterTextNormalizer.
   */
  private[this] val WarmUpLocales: Seq[Locale] =
    Seq
      .concat(
        Seq(
          Locale.JAPANESE,
          Locale.KOREAN,
          LocaleUtil.UNKNOWN,
          LocaleUtil.THAI,
          LocaleUtil.ARABIC,
          LocaleUtil.SWEDISH
        ),
        LocaleUtil.CHINESE_JAPANESE_LOCALES.asScala,
        LocaleUtil.CJK_LOCALES.asScala
      )
      // Deduplicate in one pass; replaces the redundant
      // .toSet.toArray.toSeq conversion chain and keeps a stable order.
      .distinct

  /**
   * Load the default inputs that are used for warming up this library.
   *
   * @throws IllegalArgumentException if the warmup resource is missing
   *         from the classpath (previously this surfaced as an opaque
   *         NullPointerException from `stream.close()`).
   */
  def warmUpCorpus(): Seq[String] = {
    val stream = getClass.getResourceAsStream("warmup-text.txt")
    // getResourceAsStream returns null when the resource is absent;
    // fail fast with a clear message instead of an NPE.
    require(stream != null, "missing classpath resource warmup-text.txt")
    val bytes =
      try StreamIO.buffer(stream)
      finally stream.close()
    bytes.toString("UTF-8").linesIterator.toArray.toSeq
  }

  /**
   * Exercise the functionality of this library on the specified
   * strings. In general, prefer [[warmUp]] to this method.
   */
  def warmUpWith(ver: PenguinVersion, texts: Iterable[String]): Unit =
    texts.foreach { txt =>
      // Exercise each locale
      WarmUpLocales.foreach { loc =>
        Tokenizer.get(loc, ver).tokenize(txt)
        UserMutes.builder().withPenguinVersion(ver).withLocale(loc).validate(txt)
      }

      // Exercise language detection
      TweetTokenizer.get(ver).tokenize(txt)
      UserMutes.builder().withPenguinVersion(ver).validate(txt)
    }

  // Once guarantees the warmup body runs at most one time; later calls
  // block until the first invocation completes, then return.
  private[this] val warmUpOnce = Once(warmUpWith(DefaultPenguinVersion, warmUpCorpus()))

  /**
   * The creation of the first TwitterTextTokenizer is relatively
   * expensive, and tokenizing some texts may cause significant
   * initialization.
   *
   * This method exercises the functionality of this library
   * with a range of texts in order to perform as much initialization as
   * possible before the library is used in a latency-sensitive way.
   *
   * The warmup routine will only run once. Subsequent invocations of
   * `warmUp` will do no additional work, and will return once warmup is
   * complete.
   *
   * The warmup will take on the order of seconds.
   */
  def warmUp(): Unit = warmUpOnce()
}