From 43cdcf2ed620721911b696223a482c5d2934aa82 Mon Sep 17 00:00:00 2001 From: twitter-team <> Date: Tue, 18 Apr 2023 13:04:42 -0700 Subject: [PATCH] Open-sourcing Representation Manager Representation Manager (RMS) serves as a centralized embedding management system, providing SimClusters or other embeddings as facade of the underlying storage or services. --- representation-manager/BUILD.bazel | 1 + representation-manager/README.md | 4 + representation-manager/bin/deploy.sh | 4 + .../com/twitter/representation_manager/BUILD | 17 + .../representation_manager/StoreBuilder.scala | 208 +++++ .../representation_manager/config/BUILD | 12 + .../config/ClientConfig.scala | 25 + .../config/InMemoryCacheConfig.scala | 53 ++ representation-manager/server/BUILD | 21 + .../server/src/main/resources/BUILD | 7 + .../src/main/resources/config/decider.yml | 219 +++++ .../server/src/main/resources/logback.xml | 165 ++++ .../com/twitter/representation_manager/BUILD | 13 + .../RepresentationManagerFedServer.scala | 40 + .../representation_manager/columns/BUILD | 9 + .../columns/ColumnConfigBase.scala | 26 + .../columns/topic/BUILD | 14 + ...ocaleEntityIdSimClustersEmbeddingCol.scala | 77 ++ .../TopicIdSimClustersEmbeddingCol.scala | 74 ++ .../columns/tweet/BUILD | 14 + .../tweet/TweetSimClustersEmbeddingCol.scala | 73 ++ .../representation_manager/columns/user/BUILD | 14 + .../user/UserSimClustersEmbeddingCol.scala | 73 ++ .../representation_manager/common/BUILD | 13 + .../common/MemCacheConfig.scala | 153 ++++ .../common/RepresentationManagerDecider.scala | 25 + .../representation_manager/migration/BUILD | 25 + .../migration/LegacyRMS.scala | 846 ++++++++++++++++++ .../representation_manager/modules/BUILD | 18 + .../modules/CacheModule.scala | 34 + .../modules/InterestsThriftClientModule.scala | 40 + .../modules/LegacyRMSConfigModule.scala | 18 + .../modules/StoreModule.scala | 24 + .../modules/TimerModule.scala | 13 + .../modules/UttClientModule.scala | 39 + 
.../representation_manager/store/BUILD | 16 + .../store/DeciderConstants.scala | 39 + .../TopicSimClustersEmbeddingStore.scala | 198 ++++ .../TweetSimClustersEmbeddingStore.scala | 141 +++ .../store/UserSimClustersEmbeddingStore.scala | 602 +++++++++++++ .../server/src/main/thrift/BUILD | 18 + .../representation_manager/service.thrift | 14 + 42 files changed, 3439 insertions(+) create mode 100644 representation-manager/BUILD.bazel create mode 100644 representation-manager/README.md create mode 100755 representation-manager/bin/deploy.sh create mode 100644 representation-manager/client/src/main/scala/com/twitter/representation_manager/BUILD create mode 100644 representation-manager/client/src/main/scala/com/twitter/representation_manager/StoreBuilder.scala create mode 100644 representation-manager/client/src/main/scala/com/twitter/representation_manager/config/BUILD create mode 100644 representation-manager/client/src/main/scala/com/twitter/representation_manager/config/ClientConfig.scala create mode 100644 representation-manager/client/src/main/scala/com/twitter/representation_manager/config/InMemoryCacheConfig.scala create mode 100644 representation-manager/server/BUILD create mode 100644 representation-manager/server/src/main/resources/BUILD create mode 100644 representation-manager/server/src/main/resources/config/decider.yml create mode 100644 representation-manager/server/src/main/resources/logback.xml create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/BUILD create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/RepresentationManagerFedServer.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/BUILD create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/ColumnConfigBase.scala create mode 100644 
representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/BUILD create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/LocaleEntityIdSimClustersEmbeddingCol.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/TopicIdSimClustersEmbeddingCol.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/tweet/BUILD create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/tweet/TweetSimClustersEmbeddingCol.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/user/BUILD create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/user/UserSimClustersEmbeddingCol.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/common/BUILD create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/common/MemCacheConfig.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/common/RepresentationManagerDecider.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/migration/BUILD create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/migration/LegacyRMS.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/BUILD create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/CacheModule.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/InterestsThriftClientModule.scala create mode 100644 
representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/LegacyRMSConfigModule.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/StoreModule.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/TimerModule.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/UttClientModule.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/store/BUILD create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.scala create mode 100644 representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.scala create mode 100644 representation-manager/server/src/main/thrift/BUILD create mode 100644 representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.thrift diff --git a/representation-manager/BUILD.bazel b/representation-manager/BUILD.bazel new file mode 100644 index 000000000..1624a57d4 --- /dev/null +++ b/representation-manager/BUILD.bazel @@ -0,0 +1 @@ +# This prevents SQ query from grabbing //:all since it traverses up once to find a BUILD diff --git a/representation-manager/README.md b/representation-manager/README.md new file mode 100644 index 000000000..44cd25ee7 --- /dev/null +++ b/representation-manager/README.md @@ -0,0 +1,4 @@ +# Representation Manager # + +**Representation Manager** (RMS) serves as a centralized embedding management system, providing SimClusters or other embeddings as 
a facade over the underlying storage or services. + diff --git a/representation-manager/bin/deploy.sh b/representation-manager/bin/deploy.sh new file mode 100755 index 000000000..5729d9903 --- /dev/null +++ b/representation-manager/bin/deploy.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +JOB=representation-manager bazel run --ui_event_filters=-info,-stdout,-stderr --noshow_progress \ + //relevance-platform/src/main/python/deploy -- "$@" diff --git a/representation-manager/client/src/main/scala/com/twitter/representation_manager/BUILD b/representation-manager/client/src/main/scala/com/twitter/representation_manager/BUILD new file mode 100644 index 000000000..1f69a2176 --- /dev/null +++ b/representation-manager/client/src/main/scala/com/twitter/representation_manager/BUILD @@ -0,0 +1,17 @@ +scala_library( + compiler_option_sets = ["fatal_warnings"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "finatra/inject/inject-thrift-client", + "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/store/strato", + "hermit/hermit-core/src/main/scala/com/twitter/hermit/store/common", + "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/readablestore", + "representation-manager/client/src/main/scala/com/twitter/representation_manager/config", + "representation-manager/server/src/main/thrift:thrift-scala", + "src/scala/com/twitter/simclusters_v2/common", + "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", + "stitch/stitch-storehaus", + "strato/src/main/scala/com/twitter/strato/client", + ], +) diff --git a/representation-manager/client/src/main/scala/com/twitter/representation_manager/StoreBuilder.scala b/representation-manager/client/src/main/scala/com/twitter/representation_manager/StoreBuilder.scala new file mode 100644 index 000000000..2314a8254 --- /dev/null +++ b/representation-manager/client/src/main/scala/com/twitter/representation_manager/StoreBuilder.scala @@ -0,0 +1,208 @@ +package
com.twitter.representation_manager + +import com.twitter.finagle.memcached.{Client => MemcachedClient} +import com.twitter.finagle.stats.StatsReceiver +import com.twitter.frigate.common.store.strato.StratoFetchableStore +import com.twitter.hermit.store.common.ObservedCachedReadableStore +import com.twitter.hermit.store.common.ObservedReadableStore +import com.twitter.representation_manager.config.ClientConfig +import com.twitter.representation_manager.config.DisabledInMemoryCacheParams +import com.twitter.representation_manager.config.EnabledInMemoryCacheParams +import com.twitter.representation_manager.thriftscala.SimClustersEmbeddingView +import com.twitter.simclusters_v2.common.SimClustersEmbedding +import com.twitter.simclusters_v2.thriftscala.InternalId +import com.twitter.simclusters_v2.thriftscala.LocaleEntityId +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId +import com.twitter.simclusters_v2.thriftscala.TopicId +import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} +import com.twitter.storehaus.ReadableStore +import com.twitter.strato.client.{Client => StratoClient} +import com.twitter.strato.thrift.ScroogeConvImplicits._ + +/** + * Builds ReadableStores backed by the Representation Manager Strato columns for a given + * SimClustersEmbeddingView (i.e. embeddingType and modelVersion). Each store is wrapped in an + * ObservedReadableStore, or an ObservedCachedReadableStore when ClientConfig enables caching.
+ */ +class StoreBuilder( + clientConfig: ClientConfig, + stratoClient: StratoClient, + memCachedClient: MemcachedClient, + globalStats: StatsReceiver, +) { + private val stats = + globalStats.scope("representation_manager_client").scope(this.getClass.getSimpleName) + + // Strato column paths, one per supported key type (Tweet, User, TopicId, LocaleEntityId) + private val ColPathPrefix = "recommendations/representation_manager/" + private val SimclustersTweetColPath = ColPathPrefix + "simClustersEmbedding.Tweet" + private val SimclustersUserColPath = ColPathPrefix + "simClustersEmbedding.User" + private val SimclustersTopicIdColPath = ColPathPrefix + "simClustersEmbedding.TopicId" + private val SimclustersLocaleEntityIdColPath = + ColPathPrefix + "simClustersEmbedding.LocaleEntityId" + + def buildSimclustersTweetEmbeddingStore( + embeddingColumnView: SimClustersEmbeddingView + ): ReadableStore[Long, SimClustersEmbedding] = { + val rawStore = StratoFetchableStore + .withView[Long, SimClustersEmbeddingView, ThriftSimClustersEmbedding]( + stratoClient, + SimclustersTweetColPath, + embeddingColumnView) + .mapValues(SimClustersEmbedding(_)) + + addCacheLayer(rawStore, embeddingColumnView) + } + + def buildSimclustersUserEmbeddingStore( + embeddingColumnView: SimClustersEmbeddingView + ): ReadableStore[Long, SimClustersEmbedding] = { + val rawStore = StratoFetchableStore + .withView[Long, SimClustersEmbeddingView, ThriftSimClustersEmbedding]( + stratoClient, + SimclustersUserColPath, + embeddingColumnView) + .mapValues(SimClustersEmbedding(_)) + + addCacheLayer(rawStore, embeddingColumnView) + } + + def buildSimclustersTopicIdEmbeddingStore( + embeddingColumnView: SimClustersEmbeddingView + ): ReadableStore[TopicId, SimClustersEmbedding] = { + val rawStore = StratoFetchableStore + .withView[TopicId, SimClustersEmbeddingView, ThriftSimClustersEmbedding]( + stratoClient, + SimclustersTopicIdColPath, + embeddingColumnView) + .mapValues(SimClustersEmbedding(_)) + + addCacheLayer(rawStore, embeddingColumnView) + } + + def
buildSimclustersLocaleEntityIdEmbeddingStore( + embeddingColumnView: SimClustersEmbeddingView + ): ReadableStore[LocaleEntityId, SimClustersEmbedding] = { + val rawStore = StratoFetchableStore + .withView[LocaleEntityId, SimClustersEmbeddingView, ThriftSimClustersEmbedding]( + stratoClient, + SimclustersLocaleEntityIdColPath, + embeddingColumnView) + .mapValues(SimClustersEmbedding(_)) + + addCacheLayer(rawStore, embeddingColumnView) + } + + def buildSimclustersTweetEmbeddingStoreWithEmbeddingIdAsKey( + embeddingColumnView: SimClustersEmbeddingView + ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { + val rawStore = StratoFetchableStore + .withView[Long, SimClustersEmbeddingView, ThriftSimClustersEmbedding]( + stratoClient, + SimclustersTweetColPath, + embeddingColumnView) + .mapValues(SimClustersEmbedding(_)) + val embeddingIdAsKeyStore = rawStore.composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId(_, _, InternalId.TweetId(tweetId)) => + tweetId + } + + addCacheLayer(embeddingIdAsKeyStore, embeddingColumnView) + } + + def buildSimclustersUserEmbeddingStoreWithEmbeddingIdAsKey( + embeddingColumnView: SimClustersEmbeddingView + ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { + val rawStore = StratoFetchableStore + .withView[Long, SimClustersEmbeddingView, ThriftSimClustersEmbedding]( + stratoClient, + SimclustersUserColPath, + embeddingColumnView) + .mapValues(SimClustersEmbedding(_)) + val embeddingIdAsKeyStore = rawStore.composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId(_, _, InternalId.UserId(userId)) => + userId + } + + addCacheLayer(embeddingIdAsKeyStore, embeddingColumnView) + } + + def buildSimclustersTopicEmbeddingStoreWithEmbeddingIdAsKey( + embeddingColumnView: SimClustersEmbeddingView + ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { + val rawStore = StratoFetchableStore + .withView[TopicId, SimClustersEmbeddingView, ThriftSimClustersEmbedding](
+ stratoClient, + SimclustersTopicIdColPath, + embeddingColumnView) + .mapValues(SimClustersEmbedding(_)) + val embeddingIdAsKeyStore = rawStore.composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId(_, _, InternalId.TopicId(topicId)) => + topicId + } + + addCacheLayer(embeddingIdAsKeyStore, embeddingColumnView) + } + + def buildSimclustersTopicIdEmbeddingStoreWithEmbeddingIdAsKey( + embeddingColumnView: SimClustersEmbeddingView + ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { + val rawStore = StratoFetchableStore + .withView[TopicId, SimClustersEmbeddingView, ThriftSimClustersEmbedding]( + stratoClient, + SimclustersTopicIdColPath, + embeddingColumnView) + .mapValues(SimClustersEmbedding(_)) + val embeddingIdAsKeyStore = rawStore.composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId(_, _, InternalId.TopicId(topicId)) => + topicId + } + + addCacheLayer(embeddingIdAsKeyStore, embeddingColumnView) + } + + def buildSimclustersLocaleEntityIdEmbeddingStoreWithEmbeddingIdAsKey( + embeddingColumnView: SimClustersEmbeddingView + ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { + val rawStore = StratoFetchableStore + .withView[LocaleEntityId, SimClustersEmbeddingView, ThriftSimClustersEmbedding]( + stratoClient, + SimclustersLocaleEntityIdColPath, + embeddingColumnView) + .mapValues(SimClustersEmbedding(_)) + val embeddingIdAsKeyStore = rawStore.composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId(_, _, InternalId.LocaleEntityId(localeEntityId)) => + localeEntityId + } + + addCacheLayer(embeddingIdAsKeyStore, embeddingColumnView) + } + + private def addCacheLayer[K]( + rawStore: ReadableStore[K, SimClustersEmbedding], + embeddingColumnView: SimClustersEmbeddingView, + ): ReadableStore[K, SimClustersEmbedding] = { + // Look up the in-memory cache params that ClientConfig holds for this embedding view + val inMemCacheParams = clientConfig.inMemoryCacheConfig + .getCacheSetup(embeddingColumnView.embeddingType,
embeddingColumnView.modelVersion) + + val statsPerStore = stats + .scope(embeddingColumnView.embeddingType.name).scope(embeddingColumnView.modelVersion.name) + + inMemCacheParams match { + case DisabledInMemoryCacheParams => + ObservedReadableStore( + store = rawStore + )(statsPerStore) + case EnabledInMemoryCacheParams(ttl, maxKeys, cacheName) => + ObservedCachedReadableStore.from[K, SimClustersEmbedding]( + rawStore, + ttl = ttl, + maxKeys = maxKeys, + cacheName = cacheName, + windowSize = 10000L + )(statsPerStore) + } + } + +} diff --git a/representation-manager/client/src/main/scala/com/twitter/representation_manager/config/BUILD b/representation-manager/client/src/main/scala/com/twitter/representation_manager/config/BUILD new file mode 100644 index 000000000..8418563d5 --- /dev/null +++ b/representation-manager/client/src/main/scala/com/twitter/representation_manager/config/BUILD @@ -0,0 +1,12 @@ +scala_library( + compiler_option_sets = ["fatal_warnings"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "finatra/inject/inject-thrift-client", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/common", + "representation-manager/server/src/main/thrift:thrift-scala", + "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", + "strato/src/main/scala/com/twitter/strato/client", + ], +) diff --git a/representation-manager/client/src/main/scala/com/twitter/representation_manager/config/ClientConfig.scala b/representation-manager/client/src/main/scala/com/twitter/representation_manager/config/ClientConfig.scala new file mode 100644 index 000000000..9ae0c49e7 --- /dev/null +++ b/representation-manager/client/src/main/scala/com/twitter/representation_manager/config/ClientConfig.scala @@ -0,0 +1,25 @@ +package com.twitter.representation_manager.config + +import com.twitter.simclusters_v2.thriftscala.EmbeddingType +import com.twitter.simclusters_v2.thriftscala.ModelVersion + +/* + * This is RMS client
config class. + * Only in-memory cache params can be configured for now, but we expect to enable other + * customisations in the near future, e.g. request timeout. + * + * -------------------------------------------- + * PLEASE NOTE: + * An in-memory cache is not necessarily a free performance win; anyone considering it should + * investigate rather than blindly enabling it. + * */ +class ClientConfig(inMemCacheParamsOverrides: Map[ + (EmbeddingType, ModelVersion), + InMemoryCacheParams +] = Map.empty) { + // Per-embedding in-memory cache params: the defaults overlaid with the caller's overrides + val inMemCacheParams = DefaultInMemoryCacheConfig.cacheParamsMap ++ inMemCacheParamsOverrides + val inMemoryCacheConfig = new InMemoryCacheConfig(inMemCacheParams) +} + +object DefaultClientConfig extends ClientConfig diff --git a/representation-manager/client/src/main/scala/com/twitter/representation_manager/config/InMemoryCacheConfig.scala b/representation-manager/client/src/main/scala/com/twitter/representation_manager/config/InMemoryCacheConfig.scala new file mode 100644 index 000000000..eab569b51 --- /dev/null +++ b/representation-manager/client/src/main/scala/com/twitter/representation_manager/config/InMemoryCacheConfig.scala @@ -0,0 +1,53 @@ +package com.twitter.representation_manager.config + +import com.twitter.simclusters_v2.thriftscala.EmbeddingType +import com.twitter.simclusters_v2.thriftscala.ModelVersion +import com.twitter.util.Duration + +/* + * -------------------------------------------- + * PLEASE NOTE: + * An in-memory cache is not necessarily a free performance win; anyone considering it should + * investigate rather than blindly enabling it. + * -------------------------------------------- + * */ + +sealed trait InMemoryCacheParams + +/* + * Holds the params that are required to set up an in-memory cache for a single embedding store + */ +case class EnabledInMemoryCacheParams( + ttl: Duration, + maxKeys: Int, + cacheName: String) + extends InMemoryCacheParams +object DisabledInMemoryCacheParams
extends InMemoryCacheParams + +/* + * This is the class for the in-memory cache config. Clients can pass in their own cacheParamsMap to + * create a new InMemoryCacheConfig instead of using the DefaultInMemoryCacheConfig object below. + * */ +class InMemoryCacheConfig( + cacheParamsMap: Map[ + (EmbeddingType, ModelVersion), + InMemoryCacheParams + ] = Map.empty) { + + def getCacheSetup( + embeddingType: EmbeddingType, + modelVersion: ModelVersion + ): InMemoryCacheParams = { + // No entry for (embeddingType, modelVersion) => return DisabledInMemoryCacheParams + cacheParamsMap.getOrElse((embeddingType, modelVersion), DisabledInMemoryCacheParams) + } +} + +/* + * Default config for the in-memory cache. + * Clients can directly import and use this one if they don't want to set up a customised config. + * */ +object DefaultInMemoryCacheConfig extends InMemoryCacheConfig { + // Empty by default, i.e. no in-memory caching; ClientConfig overlays its overrides on this map + val cacheParamsMap = Map.empty +} diff --git a/representation-manager/server/BUILD b/representation-manager/server/BUILD new file mode 100644 index 000000000..427fc1d3b --- /dev/null +++ b/representation-manager/server/BUILD @@ -0,0 +1,21 @@ +jvm_binary( + name = "bin", + basename = "representation-manager", + main = "com.twitter.representation_manager.RepresentationManagerFedServerMain", + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "finatra/inject/inject-logback/src/main/scala", + "loglens/loglens-logback/src/main/scala/com/twitter/loglens/logback", + "representation-manager/server/src/main/resources", + "representation-manager/server/src/main/scala/com/twitter/representation_manager", + "twitter-server/logback-classic/src/main/scala", + ], +) + +# Aurora Workflows build phase convention requires a jvm_app named with ${project-name}-app +jvm_app( + name = "representation-manager-app", + archive = "zip", + binary = ":bin", +) diff --git a/representation-manager/server/src/main/resources/BUILD
b/representation-manager/server/src/main/resources/BUILD new file mode 100644 index 000000000..b3a752276 --- /dev/null +++ b/representation-manager/server/src/main/resources/BUILD @@ -0,0 +1,7 @@ +resources( + sources = [ + "*.xml", + "config/*.yml", + ], + tags = ["bazel-compatible"], +) diff --git a/representation-manager/server/src/main/resources/config/decider.yml b/representation-manager/server/src/main/resources/config/decider.yml new file mode 100644 index 000000000..e75ebf89d --- /dev/null +++ b/representation-manager/server/src/main/resources/config/decider.yml @@ -0,0 +1,219 @@ +# ---------- traffic percentage by embedding type and model version ---------- +# Decider keys are built dynamically following the rule below, +# i.e. s"enable_${embeddingType.name}_${modelVersion.name}" +# Hence this file should be updated accordingly if that usage changes in the embedding stores + +# Tweet embeddings +"enable_LogFavBasedTweet_Model20m145k2020": + comment: "Enable x% read traffic (0<=x<=10000, e.g. 1000=10%) for LogFavBasedTweet - Model20m145k2020. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_LogFavBasedTweet_Model20m145kUpdated": + comment: "Enable x% read traffic (0<=x<=10000, e.g. 1000=10%) for LogFavBasedTweet - Model20m145kUpdated. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_LogFavLongestL2EmbeddingTweet_Model20m145k2020": + comment: "Enable x% read traffic (0<=x<=10000, e.g. 1000=10%) for LogFavLongestL2EmbeddingTweet - Model20m145k2020. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_LogFavLongestL2EmbeddingTweet_Model20m145kUpdated": + comment: "Enable x% read traffic (0<=x<=10000, e.g. 1000=10%) for LogFavLongestL2EmbeddingTweet - Model20m145kUpdated. 0 means return EMPTY for all requests."
+ default_availability: 10000 + +# Topic embeddings +"enable_FavTfgTopic_Model20m145k2020": + comment: "Enable the read traffic to FavTfgTopic - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_LogFavBasedKgoApeTopic_Model20m145k2020": + comment: "Enable the read traffic to LogFavBasedKgoApeTopic - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +# User embeddings - KnownFor +"enable_FavBasedProducer_Model20m145kUpdated": + comment: "Enable the read traffic to FavBasedProducer - Model20m145kUpdated from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_FavBasedProducer_Model20m145k2020": + comment: "Enable the read traffic to FavBasedProducer - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_FollowBasedProducer_Model20m145k2020": + comment: "Enable the read traffic to FollowBasedProducer - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_AggregatableFavBasedProducer_Model20m145kUpdated": + comment: "Enable the read traffic to AggregatableFavBasedProducer - Model20m145kUpdated from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_AggregatableFavBasedProducer_Model20m145k2020": + comment: "Enable the read traffic to AggregatableFavBasedProducer - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_AggregatableLogFavBasedProducer_Model20m145kUpdated": + comment: "Enable the read traffic to AggregatableLogFavBasedProducer - Model20m145kUpdated from 0% to 100%. 0 means return EMPTY for all requests." 
+ default_availability: 10000 + +"enable_AggregatableLogFavBasedProducer_Model20m145k2020": + comment: "Enable the read traffic to AggregatableLogFavBasedProducer - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +enable_RelaxedAggregatableLogFavBasedProducer_Model20m145kUpdated: + comment: "Enable the read traffic to RelaxedAggregatableLogFavBasedProducer - Model20m145kUpdated from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +enable_RelaxedAggregatableLogFavBasedProducer_Model20m145k2020: + comment: "Enable the read traffic to RelaxedAggregatableLogFavBasedProducer - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +# User embeddings - InterestedIn +"enable_LogFavBasedUserInterestedInFromAPE_Model20m145k2020": + comment: "Enable the read traffic to LogFavBasedUserInterestedInFromAPE - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_FollowBasedUserInterestedInFromAPE_Model20m145k2020": + comment: "Enable the read traffic to FollowBasedUserInterestedInFromAPE - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_FavBasedUserInterestedIn_Model20m145kUpdated": + comment: "Enable the read traffic to FavBasedUserInterestedIn - Model20m145kUpdated from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_FavBasedUserInterestedIn_Model20m145k2020": + comment: "Enable the read traffic to FavBasedUserInterestedIn - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_FollowBasedUserInterestedIn_Model20m145k2020": + comment: "Enable the read traffic to FollowBasedUserInterestedIn - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." 
+ default_availability: 10000 + +"enable_LogFavBasedUserInterestedIn_Model20m145k2020": + comment: "Enable the read traffic to LogFavBasedUserInterestedIn - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_FavBasedUserInterestedInFromPE_Model20m145kUpdated": + comment: "Enable the read traffic to FavBasedUserInterestedInFromPE - Model20m145kUpdated from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_FilteredUserInterestedIn_Model20m145kUpdated": + comment: "Enable the read traffic to FilteredUserInterestedIn - Model20m145kUpdated from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_FilteredUserInterestedIn_Model20m145k2020": + comment: "Enable the read traffic to FilteredUserInterestedIn - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_FilteredUserInterestedInFromPE_Model20m145kUpdated": + comment: "Enable the read traffic to FilteredUserInterestedInFromPE - Model20m145kUpdated from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_UnfilteredUserInterestedIn_Model20m145kUpdated": + comment: "Enable the read traffic to UnfilteredUserInterestedIn - Model20m145kUpdated from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_UnfilteredUserInterestedIn_Model20m145k2020": + comment: "Enable the read traffic to UnfilteredUserInterestedIn - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_UserNextInterestedIn_Model20m145k2020": + comment: "Enable the read traffic to UserNextInterestedIn - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." 
+ default_availability: 10000 + +"enable_LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE_Model20m145k2020": + comment: "Enable the read traffic to LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_LogFavBasedUserInterestedAverageAddressBookFromIIAPE_Model20m145k2020": + comment: "Enable the read traffic to LogFavBasedUserInterestedAverageAddressBookFromIIAPE - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE_Model20m145k2020": + comment: "Enable the read traffic to LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE_Model20m145k2020": + comment: "Enable the read traffic to LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE_Model20m145k2020": + comment: "Enable the read traffic to LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests." + default_availability: 10000 + +"enable_LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE_Model20m145k2020": + comment: "Enable the read traffic to LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE - Model20m145k2020 from 0% to 100%. 0 means return EMPTY for all requests."
+ default_availability: 10000 + +# ---------- load shedding by caller id ---------- +# To create a new decider, add here with the same format and caller's details : +# "representation-manager_load_shed_by_caller_id_twtr:{{role}}:{{name}}:{{environment}}:{{cluster}}" +# All the deciders below are generated by this script: +# ./strato/bin/fed deciders representation-manager --service-role=representation-manager --service-name=representation-manager +# If you need to run the script and paste the output, add ONLY the prod deciders here. +"representation-manager_load_shed_by_caller_id_all": + comment: "Reject all traffic from caller id: all" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:cr-mixer:cr-mixer:prod:atla": + comment: "Reject all traffic from caller id: twtr:svc:cr-mixer:cr-mixer:prod:atla" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:cr-mixer:cr-mixer:prod:pdxa": + comment: "Reject all traffic from caller id: twtr:svc:cr-mixer:cr-mixer:prod:pdxa" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:simclusters-ann:simclusters-ann-1:prod:atla": + comment: "Reject all traffic from caller id: twtr:svc:simclusters-ann:simclusters-ann-1:prod:atla" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:simclusters-ann:simclusters-ann-1:prod:pdxa": + comment: "Reject all traffic from caller id: twtr:svc:simclusters-ann:simclusters-ann-1:prod:pdxa" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:simclusters-ann:simclusters-ann-3:prod:atla": + comment: "Reject all traffic from caller id: twtr:svc:simclusters-ann:simclusters-ann-3:prod:atla" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:simclusters-ann:simclusters-ann-3:prod:pdxa": + comment: "Reject all traffic from caller id: twtr:svc:simclusters-ann:simclusters-ann-3:prod:pdxa" + default_availability: 0 
+ +"representation-manager_load_shed_by_caller_id_twtr:svc:simclusters-ann:simclusters-ann-4:prod:atla": + comment: "Reject all traffic from caller id: twtr:svc:simclusters-ann:simclusters-ann-4:prod:atla" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:simclusters-ann:simclusters-ann-4:prod:pdxa": + comment: "Reject all traffic from caller id: twtr:svc:simclusters-ann:simclusters-ann-4:prod:pdxa" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:simclusters-ann:simclusters-ann-experimental:prod:atla": + comment: "Reject all traffic from caller id: twtr:svc:simclusters-ann:simclusters-ann-experimental:prod:atla" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:simclusters-ann:simclusters-ann-experimental:prod:pdxa": + comment: "Reject all traffic from caller id: twtr:svc:simclusters-ann:simclusters-ann-experimental:prod:pdxa" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:simclusters-ann:simclusters-ann:prod:atla": + comment: "Reject all traffic from caller id: twtr:svc:simclusters-ann:simclusters-ann:prod:atla" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:simclusters-ann:simclusters-ann:prod:pdxa": + comment: "Reject all traffic from caller id: twtr:svc:simclusters-ann:simclusters-ann:prod:pdxa" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:stratostore:stratoapi:prod:atla": + comment: "Reject all traffic from caller id: twtr:svc:stratostore:stratoapi:prod:atla" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:stratostore:stratoserver:prod:atla": + comment: "Reject all traffic from caller id: twtr:svc:stratostore:stratoserver:prod:atla" + default_availability: 0 + +"representation-manager_load_shed_by_caller_id_twtr:svc:stratostore:stratoserver:prod:pdxa": + comment: "Reject all traffic from caller id: 
twtr:svc:stratostore:stratoserver:prod:pdxa" + default_availability: 0 + +# ---------- Dark Traffic Proxy ---------- +representation-manager_forward_dark_traffic: + comment: "Defines the percentage of traffic to forward to diffy-proxy. Set to 0 to disable dark traffic forwarding" + default_availability: 0 diff --git a/representation-manager/server/src/main/resources/logback.xml b/representation-manager/server/src/main/resources/logback.xml new file mode 100644 index 000000000..47b3ed16d --- /dev/null +++ b/representation-manager/server/src/main/resources/logback.xml @@ -0,0 +1,165 @@ + + + + + + + + + + + + + + + + + true + + + + + + + + + + + ${log.service.output} + + + ${log.service.output}.%d.gz + + 3GB + + 21 + true + + + %date %.-3level ${DEFAULT_SERVICE_PATTERN}%n + + + + + + ${log.access.output} + + + ${log.access.output}.%d.gz + + 100MB + + 7 + true + + + ${DEFAULT_ACCESS_PATTERN}%n + + + + + + true + ${log.lens.category} + ${log.lens.index} + ${log.lens.tag}/service + + %msg + + + + + + true + ${log.lens.category} + ${log.lens.index} + ${log.lens.tag}/access + + %msg + + + + + + allow_listed_pipeline_executions.log + + + allow_listed_pipeline_executions.log.%d.gz + + 100MB + + 7 + true + + + %date %.-3level ${DEFAULT_SERVICE_PATTERN}%n + + + + + + + + + + + + ${async_queue_size} + ${async_max_flush_time} + + + + + ${async_queue_size} + ${async_max_flush_time} + + + + + ${async_queue_size} + ${async_max_flush_time} + + + + + ${async_queue_size} + ${async_max_flush_time} + + + + + ${async_queue_size} + ${async_max_flush_time} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/BUILD b/representation-manager/server/src/main/scala/com/twitter/representation_manager/BUILD new file mode 100644 index 000000000..d8ca301f6 --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/BUILD @@ -0,0 +1,13 @@ +scala_library( + 
compiler_option_sets = ["fatal_warnings"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "finatra/inject/inject-thrift-client", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/tweet", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/user", + "strato/src/main/scala/com/twitter/strato/fed", + "strato/src/main/scala/com/twitter/strato/fed/server", + ], +) diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/RepresentationManagerFedServer.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/RepresentationManagerFedServer.scala new file mode 100644 index 000000000..5bc820bb4 --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/RepresentationManagerFedServer.scala @@ -0,0 +1,40 @@ +package com.twitter.representation_manager + +import com.google.inject.Module +import com.twitter.inject.thrift.modules.ThriftClientIdModule +import com.twitter.representation_manager.columns.topic.LocaleEntityIdSimClustersEmbeddingCol +import com.twitter.representation_manager.columns.topic.TopicIdSimClustersEmbeddingCol +import com.twitter.representation_manager.columns.tweet.TweetSimClustersEmbeddingCol +import com.twitter.representation_manager.columns.user.UserSimClustersEmbeddingCol +import com.twitter.representation_manager.modules.CacheModule +import com.twitter.representation_manager.modules.InterestsThriftClientModule +import com.twitter.representation_manager.modules.LegacyRMSConfigModule +import com.twitter.representation_manager.modules.StoreModule +import com.twitter.representation_manager.modules.TimerModule +import com.twitter.representation_manager.modules.UttClientModule +import com.twitter.strato.fed._ +import com.twitter.strato.fed.server._ + 
+object RepresentationManagerFedServerMain extends RepresentationManagerFedServer + +trait RepresentationManagerFedServer extends StratoFedServer { + override def dest: String = "/s/representation-manager/representation-manager" + override val modules: Seq[Module] = + Seq( + CacheModule, + InterestsThriftClientModule, + LegacyRMSConfigModule, + StoreModule, + ThriftClientIdModule, + TimerModule, + UttClientModule + ) + + override def columns: Seq[Class[_ <: StratoFed.Column]] = + Seq( + classOf[TweetSimClustersEmbeddingCol], + classOf[UserSimClustersEmbeddingCol], + classOf[TopicIdSimClustersEmbeddingCol], + classOf[LocaleEntityIdSimClustersEmbeddingCol] + ) +} diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/BUILD b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/BUILD new file mode 100644 index 000000000..6ebd77ef8 --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/BUILD @@ -0,0 +1,9 @@ +scala_library( + compiler_option_sets = ["fatal_warnings"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "strato/src/main/scala/com/twitter/strato/fed", + "strato/src/main/scala/com/twitter/strato/fed/server", + ], +) diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/ColumnConfigBase.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/ColumnConfigBase.scala new file mode 100644 index 000000000..143ccdc4c --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/ColumnConfigBase.scala @@ -0,0 +1,26 @@ +package com.twitter.representation_manager.columns + +import com.twitter.strato.access.Access.LdapGroup +import com.twitter.strato.config.ContactInfo +import com.twitter.strato.config.FromColumns +import com.twitter.strato.config.Has +import 
com.twitter.strato.config.Prefix +import com.twitter.strato.config.ServiceIdentifierPattern + +object ColumnConfigBase { + + /****************** Internal permissions *******************/ + val recosPermissions: Seq[com.twitter.strato.config.Policy] = Seq() + + /****************** External permissions *******************/ + // This is used to grant limited access to members outside of RP team. + val externalPermissions: Seq[com.twitter.strato.config.Policy] = Seq() + + val contactInfo: ContactInfo = ContactInfo( + description = "Please contact Relevance Platform for more details", + contactEmail = "no-reply@twitter.com", + ldapGroup = "ldap", + jiraProject = "JIRA", + links = Seq("http://go/rms-runbook") + ) +} diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/BUILD b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/BUILD new file mode 100644 index 000000000..26022ebe5 --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/BUILD @@ -0,0 +1,14 @@ +scala_library( + compiler_option_sets = ["fatal_warnings"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "finatra/inject/inject-core/src/main/scala", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/columns", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/modules", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/store", + "representation-manager/server/src/main/thrift:thrift-scala", + "strato/src/main/scala/com/twitter/strato/fed", + "strato/src/main/scala/com/twitter/strato/fed/server", + ], +) diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/LocaleEntityIdSimClustersEmbeddingCol.scala 
b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/LocaleEntityIdSimClustersEmbeddingCol.scala new file mode 100644 index 000000000..7b7952300 --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/LocaleEntityIdSimClustersEmbeddingCol.scala @@ -0,0 +1,77 @@ +package com.twitter.representation_manager.columns.topic + +import com.twitter.representation_manager.columns.ColumnConfigBase +import com.twitter.representation_manager.store.TopicSimClustersEmbeddingStore +import com.twitter.representation_manager.thriftscala.SimClustersEmbeddingView +import com.twitter.simclusters_v2.thriftscala.InternalId +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId +import com.twitter.simclusters_v2.thriftscala.LocaleEntityId +import com.twitter.stitch +import com.twitter.stitch.Stitch +import com.twitter.stitch.storehaus.StitchOfReadableStore +import com.twitter.strato.catalog.OpMetadata +import com.twitter.strato.config.AnyOf +import com.twitter.strato.config.ContactInfo +import com.twitter.strato.config.FromColumns +import com.twitter.strato.config.Policy +import com.twitter.strato.config.Prefix +import com.twitter.strato.data.Conv +import com.twitter.strato.data.Description.PlainText +import com.twitter.strato.data.Lifecycle +import com.twitter.strato.fed._ +import com.twitter.strato.thrift.ScroogeConv +import javax.inject.Inject + +class LocaleEntityIdSimClustersEmbeddingCol @Inject() ( + embeddingStore: TopicSimClustersEmbeddingStore) + extends StratoFed.Column( + "recommendations/representation_manager/simClustersEmbedding.LocaleEntityId") + with StratoFed.Fetch.Stitch { + + private val storeStitch: SimClustersEmbeddingId => Stitch[SimClustersEmbedding] = + StitchOfReadableStore(embeddingStore.topicSimClustersEmbeddingStore.mapValues(_.toThrift)) + + val colPermissions: 
Seq[com.twitter.strato.config.Policy] = + ColumnConfigBase.recosPermissions ++ ColumnConfigBase.externalPermissions :+ FromColumns( + Set( + Prefix("ml/featureStore/simClusters"), + )) + + override val policy: Policy = AnyOf({ + colPermissions + }) + + override type Key = LocaleEntityId + override type View = SimClustersEmbeddingView + override type Value = SimClustersEmbedding + + override val keyConv: Conv[Key] = ScroogeConv.fromStruct[LocaleEntityId] + override val viewConv: Conv[View] = ScroogeConv.fromStruct[SimClustersEmbeddingView] + override val valueConv: Conv[Value] = ScroogeConv.fromStruct[SimClustersEmbedding] + + override val contactInfo: ContactInfo = ColumnConfigBase.contactInfo + + override val metadata: OpMetadata = OpMetadata( + lifecycle = Some(Lifecycle.Production), + description = Some( + PlainText( + "The Topic SimClusters Embedding Endpoint in Representation Management Service with LocaleEntityId." + + " TDD: http://go/rms-tdd")) + ) + + override def fetch(key: Key, view: View): Stitch[Result[Value]] = { + val embeddingId = SimClustersEmbeddingId( + view.embeddingType, + view.modelVersion, + InternalId.LocaleEntityId(key) + ) + + storeStitch(embeddingId) + .map(embedding => found(embedding)) + .handle { + case stitch.NotFound => missing + } + } + +} diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/TopicIdSimClustersEmbeddingCol.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/TopicIdSimClustersEmbeddingCol.scala new file mode 100644 index 000000000..4afddbb4c --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/topic/TopicIdSimClustersEmbeddingCol.scala @@ -0,0 +1,74 @@ +package com.twitter.representation_manager.columns.topic + +import com.twitter.representation_manager.columns.ColumnConfigBase +import com.twitter.representation_manager.store.TopicSimClustersEmbeddingStore 
+import com.twitter.representation_manager.thriftscala.SimClustersEmbeddingView +import com.twitter.simclusters_v2.thriftscala.InternalId +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId +import com.twitter.simclusters_v2.thriftscala.TopicId +import com.twitter.stitch +import com.twitter.stitch.Stitch +import com.twitter.stitch.storehaus.StitchOfReadableStore +import com.twitter.strato.catalog.OpMetadata +import com.twitter.strato.config.AnyOf +import com.twitter.strato.config.ContactInfo +import com.twitter.strato.config.FromColumns +import com.twitter.strato.config.Policy +import com.twitter.strato.config.Prefix +import com.twitter.strato.data.Conv +import com.twitter.strato.data.Description.PlainText +import com.twitter.strato.data.Lifecycle +import com.twitter.strato.fed._ +import com.twitter.strato.thrift.ScroogeConv +import javax.inject.Inject + +class TopicIdSimClustersEmbeddingCol @Inject() (embeddingStore: TopicSimClustersEmbeddingStore) + extends StratoFed.Column("recommendations/representation_manager/simClustersEmbedding.TopicId") + with StratoFed.Fetch.Stitch { + + private val storeStitch: SimClustersEmbeddingId => Stitch[SimClustersEmbedding] = + StitchOfReadableStore(embeddingStore.topicSimClustersEmbeddingStore.mapValues(_.toThrift)) + + val colPermissions: Seq[com.twitter.strato.config.Policy] = + ColumnConfigBase.recosPermissions ++ ColumnConfigBase.externalPermissions :+ FromColumns( + Set( + Prefix("ml/featureStore/simClusters"), + )) + + override val policy: Policy = AnyOf({ + colPermissions + }) + + override type Key = TopicId + override type View = SimClustersEmbeddingView + override type Value = SimClustersEmbedding + + override val keyConv: Conv[Key] = ScroogeConv.fromStruct[TopicId] + override val viewConv: Conv[View] = ScroogeConv.fromStruct[SimClustersEmbeddingView] + override val valueConv: Conv[Value] = ScroogeConv.fromStruct[SimClustersEmbedding] 
+ + override val contactInfo: ContactInfo = ColumnConfigBase.contactInfo + + override val metadata: OpMetadata = OpMetadata( + lifecycle = Some(Lifecycle.Production), + description = Some(PlainText( + "The Topic SimClusters Embedding Endpoint in Representation Management Service with TopicId." + + " TDD: http://go/rms-tdd")) + ) + + override def fetch(key: Key, view: View): Stitch[Result[Value]] = { + val embeddingId = SimClustersEmbeddingId( + view.embeddingType, + view.modelVersion, + InternalId.TopicId(key) + ) + + storeStitch(embeddingId) + .map(embedding => found(embedding)) + .handle { + case stitch.NotFound => missing + } + } + +} diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/tweet/BUILD b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/tweet/BUILD new file mode 100644 index 000000000..26022ebe5 --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/tweet/BUILD @@ -0,0 +1,14 @@ +scala_library( + compiler_option_sets = ["fatal_warnings"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "finatra/inject/inject-core/src/main/scala", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/columns", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/modules", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/store", + "representation-manager/server/src/main/thrift:thrift-scala", + "strato/src/main/scala/com/twitter/strato/fed", + "strato/src/main/scala/com/twitter/strato/fed/server", + ], +) diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/tweet/TweetSimClustersEmbeddingCol.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/tweet/TweetSimClustersEmbeddingCol.scala new file mode 100644 index 
000000000..15cd4247c --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/tweet/TweetSimClustersEmbeddingCol.scala @@ -0,0 +1,73 @@ +package com.twitter.representation_manager.columns.tweet + +import com.twitter.representation_manager.columns.ColumnConfigBase +import com.twitter.representation_manager.store.TweetSimClustersEmbeddingStore +import com.twitter.representation_manager.thriftscala.SimClustersEmbeddingView +import com.twitter.simclusters_v2.thriftscala.InternalId +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId +import com.twitter.stitch +import com.twitter.stitch.Stitch +import com.twitter.stitch.storehaus.StitchOfReadableStore +import com.twitter.strato.catalog.OpMetadata +import com.twitter.strato.config.AnyOf +import com.twitter.strato.config.ContactInfo +import com.twitter.strato.config.FromColumns +import com.twitter.strato.config.Policy +import com.twitter.strato.config.Prefix +import com.twitter.strato.data.Conv +import com.twitter.strato.data.Description.PlainText +import com.twitter.strato.data.Lifecycle +import com.twitter.strato.fed._ +import com.twitter.strato.thrift.ScroogeConv +import javax.inject.Inject + +class TweetSimClustersEmbeddingCol @Inject() (embeddingStore: TweetSimClustersEmbeddingStore) + extends StratoFed.Column("recommendations/representation_manager/simClustersEmbedding.Tweet") + with StratoFed.Fetch.Stitch { + + private val storeStitch: SimClustersEmbeddingId => Stitch[SimClustersEmbedding] = + StitchOfReadableStore(embeddingStore.tweetSimClustersEmbeddingStore.mapValues(_.toThrift)) + + val colPermissions: Seq[com.twitter.strato.config.Policy] = + ColumnConfigBase.recosPermissions ++ ColumnConfigBase.externalPermissions :+ FromColumns( + Set( + Prefix("ml/featureStore/simClusters"), + )) + + override val policy: Policy = AnyOf({ + colPermissions + }) + + override type Key = Long // 
TweetId + override type View = SimClustersEmbeddingView + override type Value = SimClustersEmbedding + + override val keyConv: Conv[Key] = Conv.long + override val viewConv: Conv[View] = ScroogeConv.fromStruct[SimClustersEmbeddingView] + override val valueConv: Conv[Value] = ScroogeConv.fromStruct[SimClustersEmbedding] + + override val contactInfo: ContactInfo = ColumnConfigBase.contactInfo + + override val metadata: OpMetadata = OpMetadata( + lifecycle = Some(Lifecycle.Production), + description = Some( + PlainText("The Tweet SimClusters Embedding Endpoint in Representation Management Service." + + " TDD: http://go/rms-tdd")) + ) + + override def fetch(key: Key, view: View): Stitch[Result[Value]] = { + val embeddingId = SimClustersEmbeddingId( + view.embeddingType, + view.modelVersion, + InternalId.TweetId(key) + ) + + storeStitch(embeddingId) + .map(embedding => found(embedding)) + .handle { + case stitch.NotFound => missing + } + } + +} diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/user/BUILD b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/user/BUILD new file mode 100644 index 000000000..26022ebe5 --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/user/BUILD @@ -0,0 +1,14 @@ +scala_library( + compiler_option_sets = ["fatal_warnings"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "finatra/inject/inject-core/src/main/scala", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/columns", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/modules", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/store", + "representation-manager/server/src/main/thrift:thrift-scala", + "strato/src/main/scala/com/twitter/strato/fed", + "strato/src/main/scala/com/twitter/strato/fed/server", + ], +) diff --git 
a/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/user/UserSimClustersEmbeddingCol.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/user/UserSimClustersEmbeddingCol.scala new file mode 100644 index 000000000..ebcf22a1d --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/columns/user/UserSimClustersEmbeddingCol.scala @@ -0,0 +1,73 @@ +package com.twitter.representation_manager.columns.user + +import com.twitter.representation_manager.columns.ColumnConfigBase +import com.twitter.representation_manager.store.UserSimClustersEmbeddingStore +import com.twitter.representation_manager.thriftscala.SimClustersEmbeddingView +import com.twitter.simclusters_v2.thriftscala.InternalId +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbedding +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId +import com.twitter.stitch +import com.twitter.stitch.Stitch +import com.twitter.stitch.storehaus.StitchOfReadableStore +import com.twitter.strato.catalog.OpMetadata +import com.twitter.strato.config.AnyOf +import com.twitter.strato.config.ContactInfo +import com.twitter.strato.config.FromColumns +import com.twitter.strato.config.Policy +import com.twitter.strato.config.Prefix +import com.twitter.strato.data.Conv +import com.twitter.strato.data.Description.PlainText +import com.twitter.strato.data.Lifecycle +import com.twitter.strato.fed._ +import com.twitter.strato.thrift.ScroogeConv +import javax.inject.Inject + +class UserSimClustersEmbeddingCol @Inject() (embeddingStore: UserSimClustersEmbeddingStore) + extends StratoFed.Column("recommendations/representation_manager/simClustersEmbedding.User") + with StratoFed.Fetch.Stitch { + + private val storeStitch: SimClustersEmbeddingId => Stitch[SimClustersEmbedding] = + StitchOfReadableStore(embeddingStore.userSimClustersEmbeddingStore.mapValues(_.toThrift)) + + val colPermissions: 
Seq[com.twitter.strato.config.Policy] = + ColumnConfigBase.recosPermissions ++ ColumnConfigBase.externalPermissions :+ FromColumns( + Set( + Prefix("ml/featureStore/simClusters"), + )) + + override val policy: Policy = AnyOf({ + colPermissions + }) + + override type Key = Long // UserId + override type View = SimClustersEmbeddingView + override type Value = SimClustersEmbedding + + override val keyConv: Conv[Key] = Conv.long + override val viewConv: Conv[View] = ScroogeConv.fromStruct[SimClustersEmbeddingView] + override val valueConv: Conv[Value] = ScroogeConv.fromStruct[SimClustersEmbedding] + + override val contactInfo: ContactInfo = ColumnConfigBase.contactInfo + + override val metadata: OpMetadata = OpMetadata( + lifecycle = Some(Lifecycle.Production), + description = Some( + PlainText("The User SimClusters Embedding Endpoint in Representation Management Service." + + " TDD: http://go/rms-tdd")) + ) + + override def fetch(key: Key, view: View): Stitch[Result[Value]] = { + val embeddingId = SimClustersEmbeddingId( + view.embeddingType, + view.modelVersion, + InternalId.UserId(key) + ) + + storeStitch(embeddingId) + .map(embedding => found(embedding)) + .handle { + case stitch.NotFound => missing + } + } + +} diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/common/BUILD b/representation-manager/server/src/main/scala/com/twitter/representation_manager/common/BUILD new file mode 100644 index 000000000..62b8f5dd2 --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/common/BUILD @@ -0,0 +1,13 @@ +scala_library( + compiler_option_sets = ["fatal_warnings"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "decider/src/main/scala", + "finagle/finagle-memcached", + "hermit/hermit-core/src/main/scala/com/twitter/hermit/store/common", + "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/injection", + 
"src/scala/com/twitter/simclusters_v2/common", + "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", + ], +) diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/common/MemCacheConfig.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/common/MemCacheConfig.scala new file mode 100644 index 000000000..4741edb2d --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/common/MemCacheConfig.scala @@ -0,0 +1,153 @@ +package com.twitter.representation_manager.common + +import com.twitter.bijection.scrooge.BinaryScalaCodec +import com.twitter.conversions.DurationOps._ +import com.twitter.finagle.memcached.Client +import com.twitter.finagle.stats.StatsReceiver +import com.twitter.hashing.KeyHasher +import com.twitter.hermit.store.common.ObservedMemcachedReadableStore +import com.twitter.relevance_platform.common.injection.LZ4Injection +import com.twitter.simclusters_v2.common.SimClustersEmbedding +import com.twitter.simclusters_v2.common.SimClustersEmbeddingIdCacheKeyBuilder +import com.twitter.simclusters_v2.thriftscala.EmbeddingType +import com.twitter.simclusters_v2.thriftscala.EmbeddingType._ +import com.twitter.simclusters_v2.thriftscala.ModelVersion +import com.twitter.simclusters_v2.thriftscala.ModelVersion._ +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId +import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} +import com.twitter.storehaus.ReadableStore +import com.twitter.util.Duration + +/* + * NOTE - ALL the cache configs here are just placeholders, NONE of them is used anyweher in RMS yet + * */ +sealed trait MemCacheParams +sealed trait MemCacheConfig + +/* + * This holds params that is required to set up a memcache cache for a single embedding store + * */ +case class EnabledMemCacheParams(ttl: Duration) extends MemCacheParams +object DisabledMemCacheParams 
/**
 * Single source used to set up memcache for all RMS use cases.
 *
 * NO OVERRIDE FROM CLIENT: the per-(embedding type, model version) settings live in
 * `cacheParamsMap`; a pair that is not listed there resolves to `DisabledMemCacheParams`
 * and is served straight from the raw store.
 */
object MemCacheConfig {
  val keyHasher: KeyHasher = KeyHasher.FNV1A_64
  val hashKeyPrefix: String = "RMS"
  // Exposed for callers; the store built by buildMemCacheStoreForSimClustersEmbedding below
  // does not use this builder, it derives keys from getCacheKeyPrefix + key.toString.
  val simclustersEmbeddingCacheKeyBuilder =
    SimClustersEmbeddingIdCacheKeyBuilder(keyHasher.hashKey, hashKeyPrefix)

  /** Memcache settings keyed by (embedding type, model version). */
  val cacheParamsMap: Map[
    (EmbeddingType, ModelVersion),
    MemCacheParams
  ] = Map(
    // Tweet Embeddings
    (LogFavBasedTweet, Model20m145kUpdated) -> EnabledMemCacheParams(ttl = 10.minutes),
    (LogFavBasedTweet, Model20m145k2020) -> EnabledMemCacheParams(ttl = 10.minutes),
    (LogFavLongestL2EmbeddingTweet, Model20m145kUpdated) -> EnabledMemCacheParams(ttl = 10.minutes),
    (LogFavLongestL2EmbeddingTweet, Model20m145k2020) -> EnabledMemCacheParams(ttl = 10.minutes),
    // User - KnownFor Embeddings
    (FavBasedProducer, Model20m145kUpdated) -> EnabledMemCacheParams(ttl = 12.hours),
    (FavBasedProducer, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (FollowBasedProducer, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (AggregatableLogFavBasedProducer, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (RelaxedAggregatableLogFavBasedProducer, Model20m145kUpdated) -> EnabledMemCacheParams(ttl =
      12.hours),
    (RelaxedAggregatableLogFavBasedProducer, Model20m145k2020) -> EnabledMemCacheParams(ttl =
      12.hours),
    // User - InterestedIn Embeddings
    (LogFavBasedUserInterestedInFromAPE, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (FollowBasedUserInterestedInFromAPE, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (FavBasedUserInterestedIn, Model20m145kUpdated) -> EnabledMemCacheParams(ttl = 12.hours),
    (FavBasedUserInterestedIn, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (FollowBasedUserInterestedIn, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (LogFavBasedUserInterestedIn, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (FavBasedUserInterestedInFromPE, Model20m145kUpdated) -> EnabledMemCacheParams(ttl = 12.hours),
    (FilteredUserInterestedIn, Model20m145kUpdated) -> EnabledMemCacheParams(ttl = 12.hours),
    (FilteredUserInterestedIn, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (FilteredUserInterestedInFromPE, Model20m145kUpdated) -> EnabledMemCacheParams(ttl = 12.hours),
    (UnfilteredUserInterestedIn, Model20m145kUpdated) -> EnabledMemCacheParams(ttl = 12.hours),
    (UnfilteredUserInterestedIn, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (UserNextInterestedIn, Model20m145k2020) -> EnabledMemCacheParams(ttl =
      30.minutes), //embedding is updated every 2 hours, keeping it lower to avoid staleness
    (
      LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE,
      Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (
      LogFavBasedUserInterestedAverageAddressBookFromIIAPE,
      Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (
      LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE,
      Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (
      LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE,
      Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (
      LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE,
      Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (
      LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE,
      Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    // Topic Embeddings
    (FavTfgTopic, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
    (LogFavBasedKgoApeTopic, Model20m145k2020) -> EnabledMemCacheParams(ttl = 12.hours),
  )

  /**
   * Looks up the memcache settings for the given pair.
   *
   * @return the entry from `cacheParamsMap`, or `DisabledMemCacheParams` when the requested
   *         (embeddingType, modelVersion) is not listed.
   */
  def getCacheSetup(
    embeddingType: EmbeddingType,
    modelVersion: ModelVersion
  ): MemCacheParams =
    cacheParamsMap.getOrElse((embeddingType, modelVersion), DisabledMemCacheParams)

  /** Memcache key prefix, built from the `value` of the embedding type and model version. */
  def getCacheKeyPrefix(embeddingType: EmbeddingType, modelVersion: ModelVersion): String =
    s"${embeddingType.value}_${modelVersion.value}_"

  /** Stats scope name, built from the `name` of the embedding type and model version. */
  def getStatsName(embeddingType: EmbeddingType, modelVersion: ModelVersion): String =
    s"${embeddingType.name}_${modelVersion.name}_mem_cache"

  /**
   * Build a ReadableStore based on MemCacheConfig.
   *
   * If memcache is disabled for the pair, the rawStore is used as is;
   * if memcache is enabled, the rawStore is wrapped in an ObservedMemcachedReadableStore
   * set up according to the EnabledMemCacheParams (TTL), with LZ4-compressed thrift values,
   * stats scoped under `getStatsName` and keys of the form `getCacheKeyPrefix` + key.toString.
   * In both cases the thrift values are mapped to SimClustersEmbedding.
   *
   * @param rawStore      backing store returning thrift embeddings
   * @param cacheClient   memcache client used when caching is enabled
   * @param embeddingType embedding type used to select the cache settings, key prefix and stats name
   * @param modelVersion  model version used to select the cache settings, key prefix and stats name
   * @param stats         parent stats receiver; scoped by `getStatsName` when caching is enabled
   */
  def buildMemCacheStoreForSimClustersEmbedding(
    rawStore: ReadableStore[SimClustersEmbeddingId, ThriftSimClustersEmbedding],
    cacheClient: Client,
    embeddingType: EmbeddingType,
    modelVersion: ModelVersion,
    stats: StatsReceiver
  ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
    val store = getCacheSetup(embeddingType, modelVersion) match {
      case DisabledMemCacheParams => rawStore
      case EnabledMemCacheParams(ttl) =>
        val memCacheKeyPrefix = getCacheKeyPrefix(embeddingType, modelVersion)
        val statsName = getStatsName(embeddingType, modelVersion)
        ObservedMemcachedReadableStore.fromCacheClient(
          backingStore = rawStore,
          cacheClient = cacheClient,
          ttl = ttl
        )(
          valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)),
          statsReceiver = stats.scope(statsName),
          keyToString = { k => memCacheKeyPrefix + k.toString }
        )
    }
    store.mapValues(SimClustersEmbedding(_))
  }

}
/**
 * Injectable wrapper around a Decider that also exposes a DeciderGateBuilderWithIdHashing
 * built from the same decider.
 */
case class RepresentationManagerDecider @Inject() (decider: Decider) {

  val deciderGateBuilder = new DeciderGateBuilderWithIdHashing(decider)

  /** Forwards the feature/recipient pair unchanged to the wrapped decider. */
  def isAvailable(feature: String, recipient: Option[Recipient]): Boolean =
    decider.isAvailable(feature, recipient)

  /**
   * Checks a feature with either a random recipient or no recipient at all.
   *
   * When useRandomRecipient is set to false, the decider is either completely on or off.
   * When useRandomRecipient is set to true, the decider is on for the specified % of traffic.
   */
  def isAvailable(feature: String, useRandomRecipient: Boolean = true): Boolean = {
    // Typed explicitly so the call below resolves to the Option[Recipient] overload.
    val recipient: Option[Recipient] =
      if (useRandomRecipient) Some(RandomRecipient) else None
    isAvailable(feature, recipient)
  }
}
a/representation-manager/server/src/main/scala/com/twitter/representation_manager/migration/LegacyRMS.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/migration/LegacyRMS.scala new file mode 100644 index 000000000..378f33594 --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/migration/LegacyRMS.scala @@ -0,0 +1,846 @@ +package com.twitter.representation_manager.migration + +import com.twitter.bijection.Injection +import com.twitter.bijection.scrooge.BinaryScalaCodec +import com.twitter.contentrecommender.store.ApeEntityEmbeddingStore +import com.twitter.contentrecommender.store.InterestsOptOutStore +import com.twitter.contentrecommender.store.SemanticCoreTopicSeedStore +import com.twitter.contentrecommender.twistly +import com.twitter.conversions.DurationOps._ +import com.twitter.decider.Decider +import com.twitter.escherbird.util.uttclient.CacheConfigV2 +import com.twitter.escherbird.util.uttclient.CachedUttClientV2 +import com.twitter.escherbird.util.uttclient.UttClientCacheConfigsV2 +import com.twitter.escherbird.utt.strato.thriftscala.Environment +import com.twitter.finagle.ThriftMux +import com.twitter.finagle.memcached.Client +import com.twitter.finagle.mtls.authentication.ServiceIdentifier +import com.twitter.finagle.mtls.client.MtlsStackClient.MtlsThriftMuxClientSyntax +import com.twitter.finagle.mux.ClientDiscardedRequestException +import com.twitter.finagle.service.ReqRep +import com.twitter.finagle.service.ResponseClass +import com.twitter.finagle.stats.StatsReceiver +import com.twitter.finagle.thrift.ClientId +import com.twitter.frigate.common.store.strato.StratoFetchableStore +import com.twitter.frigate.common.util.SeqLongInjection +import com.twitter.hashing.KeyHasher +import com.twitter.hermit.store.common.DeciderableReadableStore +import com.twitter.hermit.store.common.ObservedCachedReadableStore +import 
com.twitter.hermit.store.common.ObservedMemcachedReadableStore +import com.twitter.hermit.store.common.ObservedReadableStore +import com.twitter.interests.thriftscala.InterestsThriftService +import com.twitter.relevance_platform.common.injection.LZ4Injection +import com.twitter.relevance_platform.common.readablestore.ReadableStoreWithTimeout +import com.twitter.representation_manager.common.RepresentationManagerDecider +import com.twitter.representation_manager.store.DeciderConstants +import com.twitter.representation_manager.store.DeciderKey +import com.twitter.simclusters_v2.common.ModelVersions +import com.twitter.simclusters_v2.common.SimClustersEmbedding +import com.twitter.simclusters_v2.common.SimClustersEmbeddingIdCacheKeyBuilder +import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore +import com.twitter.simclusters_v2.summingbird.stores.PersistentTweetEmbeddingStore +import com.twitter.simclusters_v2.summingbird.stores.ProducerClusterEmbeddingReadableStores +import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore +import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn +import com.twitter.simclusters_v2.thriftscala.EmbeddingType +import com.twitter.simclusters_v2.thriftscala.EmbeddingType._ +import com.twitter.simclusters_v2.thriftscala.InternalId +import com.twitter.simclusters_v2.thriftscala.ModelVersion +import com.twitter.simclusters_v2.thriftscala.ModelVersion.Model20m145k2020 +import com.twitter.simclusters_v2.thriftscala.ModelVersion.Model20m145kUpdated +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId +import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbedding +import com.twitter.simclusters_v2.thriftscala.SimClustersMultiEmbeddingId +import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} +import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams +import com.twitter.storehaus.ReadableStore +import 
com.twitter.storehaus_internal.manhattan.Athena +import com.twitter.storehaus_internal.manhattan.ManhattanRO +import com.twitter.storehaus_internal.manhattan.ManhattanROConfig +import com.twitter.storehaus_internal.util.ApplicationID +import com.twitter.storehaus_internal.util.DatasetName +import com.twitter.storehaus_internal.util.HDFSPath +import com.twitter.strato.client.Strato +import com.twitter.strato.client.{Client => StratoClient} +import com.twitter.strato.thrift.ScroogeConvImplicits._ +import com.twitter.tweetypie.util.UserId +import com.twitter.util.Duration +import com.twitter.util.Future +import com.twitter.util.Throw +import com.twitter.util.Timer +import javax.inject.Inject +import javax.inject.Named +import scala.reflect.ClassTag + +class LegacyRMS @Inject() ( + serviceIdentifier: ServiceIdentifier, + cacheClient: Client, + stats: StatsReceiver, + decider: Decider, + clientId: ClientId, + timer: Timer, + @Named("cacheHashKeyPrefix") val cacheHashKeyPrefix: String = "RMS", + @Named("useContentRecommenderConfiguration") val useContentRecommenderConfiguration: Boolean = + false) { + + private val mhMtlsParams: ManhattanKVClientMtlsParams = ManhattanKVClientMtlsParams( + serviceIdentifier) + private val rmsDecider = RepresentationManagerDecider(decider) + val keyHasher: KeyHasher = KeyHasher.FNV1A_64 + + private val embeddingCacheKeyBuilder = + SimClustersEmbeddingIdCacheKeyBuilder(keyHasher.hashKey, cacheHashKeyPrefix) + private val statsReceiver = stats.scope("representation_management") + + // Strato client, default timeout = 280ms + val stratoClient: StratoClient = + Strato.client + .withMutualTls(serviceIdentifier) + .build() + + // Builds ThriftMux client builder for Content-Recommender service + private def makeThriftClientBuilder( + requestTimeout: Duration + ): ThriftMux.Client = { + ThriftMux.client + .withClientId(clientId) + .withMutualTls(serviceIdentifier) + .withRequestTimeout(requestTimeout) + 
.withStatsReceiver(statsReceiver.scope("clnt")) + .withResponseClassifier { + case ReqRep(_, Throw(_: ClientDiscardedRequestException)) => ResponseClass.Ignorable + } + } + + private def makeThriftClient[ThriftServiceType: ClassTag]( + dest: String, + label: String, + requestTimeout: Duration = 450.milliseconds + ): ThriftServiceType = { + makeThriftClientBuilder(requestTimeout) + .build[ThriftServiceType](dest, label) + } + + /** *** SimCluster Embedding Stores ******/ + implicit val simClustersEmbeddingIdInjection: Injection[SimClustersEmbeddingId, Array[Byte]] = + BinaryScalaCodec(SimClustersEmbeddingId) + implicit val simClustersEmbeddingInjection: Injection[ThriftSimClustersEmbedding, Array[Byte]] = + BinaryScalaCodec(ThriftSimClustersEmbedding) + implicit val simClustersMultiEmbeddingInjection: Injection[SimClustersMultiEmbedding, Array[ + Byte + ]] = + BinaryScalaCodec(SimClustersMultiEmbedding) + implicit val simClustersMultiEmbeddingIdInjection: Injection[SimClustersMultiEmbeddingId, Array[ + Byte + ]] = + BinaryScalaCodec(SimClustersMultiEmbeddingId) + + def getEmbeddingsDataset( + mhMtlsParams: ManhattanKVClientMtlsParams, + datasetName: String + ): ReadableStore[SimClustersEmbeddingId, ThriftSimClustersEmbedding] = { + ManhattanRO.getReadableStoreWithMtls[SimClustersEmbeddingId, ThriftSimClustersEmbedding]( + ManhattanROConfig( + HDFSPath(""), // not needed + ApplicationID("content_recommender_athena"), + DatasetName(datasetName), // this should be correct + Athena + ), + mhMtlsParams + ) + } + + lazy val logFavBasedLongestL2Tweet20M145K2020EmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val rawStore = + PersistentTweetEmbeddingStore + .longestL2NormTweetEmbeddingStoreManhattan( + mhMtlsParams, + PersistentTweetEmbeddingStore.LogFavBased20m145k2020Dataset, + statsReceiver, + maxLength = 10, + ).mapValues(_.toThrift) + + val memcachedStore = ObservedMemcachedReadableStore.fromCacheClient( + backingStore = 
rawStore, + cacheClient = cacheClient, + ttl = 15.minutes + )( + valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)), + statsReceiver = + statsReceiver.scope("log_fav_based_longest_l2_tweet_embedding_20m145k2020_mem_cache"), + keyToString = { k => + s"scez_l2:${LogFavBasedTweet}_${ModelVersions.Model20M145K2020}_$k" + } + ) + + val inMemoryCacheStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = + memcachedStore + .composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId( + LogFavLongestL2EmbeddingTweet, + Model20m145k2020, + InternalId.TweetId(tweetId)) => + tweetId + } + .mapValues(SimClustersEmbedding(_)) + + ObservedCachedReadableStore.from[SimClustersEmbeddingId, SimClustersEmbedding]( + inMemoryCacheStore, + ttl = 12.minute, + maxKeys = 1048575, + cacheName = "log_fav_based_longest_l2_tweet_embedding_20m145k2020_cache", + windowSize = 10000L + )(statsReceiver.scope("log_fav_based_longest_l2_tweet_embedding_20m145k2020_store")) + } + + lazy val logFavBased20M145KUpdatedTweetEmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val rawStore = + PersistentTweetEmbeddingStore + .mostRecentTweetEmbeddingStoreManhattan( + mhMtlsParams, + PersistentTweetEmbeddingStore.LogFavBased20m145kUpdatedDataset, + statsReceiver + ).mapValues(_.toThrift) + + val memcachedStore = ObservedMemcachedReadableStore.fromCacheClient( + backingStore = rawStore, + cacheClient = cacheClient, + ttl = 10.minutes + )( + valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)), + statsReceiver = statsReceiver.scope("log_fav_based_tweet_embedding_mem_cache"), + keyToString = { k => + // SimClusters_embedding_LZ4/embeddingType_modelVersion_tweetId + s"scez:${LogFavBasedTweet}_${ModelVersions.Model20M145KUpdated}_$k" + } + ) + + val inMemoryCacheStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { + memcachedStore + 
.composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId( + LogFavBasedTweet, + Model20m145kUpdated, + InternalId.TweetId(tweetId)) => + tweetId + } + .mapValues(SimClustersEmbedding(_)) + } + + ObservedCachedReadableStore.from[SimClustersEmbeddingId, SimClustersEmbedding]( + inMemoryCacheStore, + ttl = 5.minute, + maxKeys = 1048575, // 200MB + cacheName = "log_fav_based_tweet_embedding_cache", + windowSize = 10000L + )(statsReceiver.scope("log_fav_based_tweet_embedding_store")) + } + + lazy val logFavBased20M145K2020TweetEmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val rawStore = + PersistentTweetEmbeddingStore + .mostRecentTweetEmbeddingStoreManhattan( + mhMtlsParams, + PersistentTweetEmbeddingStore.LogFavBased20m145k2020Dataset, + statsReceiver, + maxLength = 10, + ).mapValues(_.toThrift) + + val memcachedStore = ObservedMemcachedReadableStore.fromCacheClient( + backingStore = rawStore, + cacheClient = cacheClient, + ttl = 15.minutes + )( + valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)), + statsReceiver = statsReceiver.scope("log_fav_based_tweet_embedding_20m145k2020_mem_cache"), + keyToString = { k => + // SimClusters_embedding_LZ4/embeddingType_modelVersion_tweetId + s"scez:${LogFavBasedTweet}_${ModelVersions.Model20M145K2020}_$k" + } + ) + + val inMemoryCacheStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = + memcachedStore + .composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId( + LogFavBasedTweet, + Model20m145k2020, + InternalId.TweetId(tweetId)) => + tweetId + } + .mapValues(SimClustersEmbedding(_)) + + ObservedCachedReadableStore.from[SimClustersEmbeddingId, SimClustersEmbedding]( + inMemoryCacheStore, + ttl = 12.minute, + maxKeys = 16777215, + cacheName = "log_fav_based_tweet_embedding_20m145k2020_cache", + windowSize = 10000L + )(statsReceiver.scope("log_fav_based_tweet_embedding_20m145k2020_store")) + } + + lazy 
val favBasedTfgTopicEmbedding2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val stratoStore = + StratoFetchableStore + .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding]( + stratoClient, + "recommendations/simclusters_v2/embeddings/favBasedTFGTopic20M145K2020") + + val truncatedStore = stratoStore.mapValues { embedding => + SimClustersEmbedding(embedding, truncate = 50) + } + + ObservedCachedReadableStore.from( + ObservedReadableStore(truncatedStore)( + statsReceiver.scope("fav_tfg_topic_embedding_2020_cache_backing_store")), + ttl = 12.hours, + maxKeys = 262143, // 200MB + cacheName = "fav_tfg_topic_embedding_2020_cache", + windowSize = 10000L + )(statsReceiver.scope("fav_tfg_topic_embedding_2020_cache")) + } + + lazy val logFavBasedApe20M145K2020EmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + ObservedReadableStore( + StratoFetchableStore + .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding]( + stratoClient, + "recommendations/simclusters_v2/embeddings/logFavBasedAPE20M145K2020") + .composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId( + AggregatableLogFavBasedProducer, + Model20m145k2020, + internalId) => + SimClustersEmbeddingId(AggregatableLogFavBasedProducer, Model20m145k2020, internalId) + } + .mapValues(embedding => SimClustersEmbedding(embedding, 50)) + )(statsReceiver.scope("aggregatable_producer_embeddings_by_logfav_score_2020")) + } + + val interestService: InterestsThriftService.MethodPerEndpoint = + makeThriftClient[InterestsThriftService.MethodPerEndpoint]( + "/s/interests-thrift-service/interests-thrift-service", + "interests_thrift_service" + ) + + val interestsOptOutStore: InterestsOptOutStore = InterestsOptOutStore(interestService) + + // Save 2 ^ 18 UTTs. 
Promising 100% cache rate + lazy val defaultCacheConfigV2: CacheConfigV2 = CacheConfigV2(262143) + lazy val uttClientCacheConfigsV2: UttClientCacheConfigsV2 = UttClientCacheConfigsV2( + getTaxonomyConfig = defaultCacheConfigV2, + getUttTaxonomyConfig = defaultCacheConfigV2, + getLeafIds = defaultCacheConfigV2, + getLeafUttEntities = defaultCacheConfigV2 + ) + + // CachedUttClient to use StratoClient + lazy val cachedUttClientV2: CachedUttClientV2 = new CachedUttClientV2( + stratoClient = stratoClient, + env = Environment.Prod, + cacheConfigs = uttClientCacheConfigsV2, + statsReceiver = statsReceiver.scope("cached_utt_client") + ) + + lazy val semanticCoreTopicSeedStore: ReadableStore[ + SemanticCoreTopicSeedStore.Key, + Seq[UserId] + ] = { + /* + Up to 1000 Long seeds per topic/language = 62.5kb per topic/language (worst case) + Assume ~10k active topic/languages ~= 650MB (worst case) + */ + val underlying = new SemanticCoreTopicSeedStore(cachedUttClientV2, interestsOptOutStore)( + statsReceiver.scope("semantic_core_topic_seed_store")) + + val memcacheStore = ObservedMemcachedReadableStore.fromCacheClient( + backingStore = underlying, + cacheClient = cacheClient, + ttl = 12.hours + )( + valueInjection = SeqLongInjection, + statsReceiver = statsReceiver.scope("topic_producer_seed_store_mem_cache"), + keyToString = { k => s"tpss:${k.entityId}_${k.languageCode}" } + ) + + ObservedCachedReadableStore.from[SemanticCoreTopicSeedStore.Key, Seq[UserId]]( + store = memcacheStore, + ttl = 6.hours, + maxKeys = 20e3.toInt, + cacheName = "topic_producer_seed_store_cache", + windowSize = 5000 + )(statsReceiver.scope("topic_producer_seed_store_cache")) + } + + lazy val logFavBasedApeEntity20M145K2020EmbeddingStore: ApeEntityEmbeddingStore = { + val apeStore = logFavBasedApe20M145K2020EmbeddingStore.composeKeyMapping[UserId]({ id => + SimClustersEmbeddingId( + AggregatableLogFavBasedProducer, + Model20m145k2020, + InternalId.UserId(id)) + }) + + new ApeEntityEmbeddingStore( + 
semanticCoreSeedStore = semanticCoreTopicSeedStore, + aggregatableProducerEmbeddingStore = apeStore, + statsReceiver = statsReceiver.scope("log_fav_based_ape_entity_2020_embedding_store")) + } + + lazy val logFavBasedApeEntity20M145K2020EmbeddingCachedStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val truncatedStore = + logFavBasedApeEntity20M145K2020EmbeddingStore.mapValues(_.truncate(50).toThrift) + + val memcachedStore = ObservedMemcachedReadableStore + .fromCacheClient( + backingStore = truncatedStore, + cacheClient = cacheClient, + ttl = 12.hours + )( + valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)), + statsReceiver = statsReceiver.scope("log_fav_based_ape_entity_2020_embedding_mem_cache"), + keyToString = { k => embeddingCacheKeyBuilder.apply(k) } + ).mapValues(SimClustersEmbedding(_)) + + val inMemoryCachedStore = + ObservedCachedReadableStore.from[SimClustersEmbeddingId, SimClustersEmbedding]( + memcachedStore, + ttl = 6.hours, + maxKeys = 262143, + cacheName = "log_fav_based_ape_entity_2020_embedding_cache", + windowSize = 10000L + )(statsReceiver.scope("log_fav_based_ape_entity_2020_embedding_cached_store")) + + DeciderableReadableStore( + inMemoryCachedStore, + rmsDecider.deciderGateBuilder.idGateWithHashing[SimClustersEmbeddingId]( + DeciderKey.enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore), + statsReceiver.scope("log_fav_based_ape_entity_2020_embedding_deciderable_store") + ) + } + + lazy val relaxedLogFavBasedApe20M145K2020EmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + ObservedReadableStore( + StratoFetchableStore + .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding]( + stratoClient, + "recommendations/simclusters_v2/embeddings/logFavBasedAPERelaxedFavEngagementThreshold20M145K2020") + .composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId( + RelaxedAggregatableLogFavBasedProducer, + 
Model20m145k2020, + internalId) => + SimClustersEmbeddingId( + RelaxedAggregatableLogFavBasedProducer, + Model20m145k2020, + internalId) + } + .mapValues(embedding => SimClustersEmbedding(embedding).truncate(50)) + )(statsReceiver.scope( + "aggregatable_producer_embeddings_by_logfav_score_relaxed_fav_engagement_threshold_2020")) + } + + lazy val relaxedLogFavBasedApe20M145K2020EmbeddingCachedStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val truncatedStore = + relaxedLogFavBasedApe20M145K2020EmbeddingStore.mapValues(_.truncate(50).toThrift) + + val memcachedStore = ObservedMemcachedReadableStore + .fromCacheClient( + backingStore = truncatedStore, + cacheClient = cacheClient, + ttl = 12.hours + )( + valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)), + statsReceiver = + statsReceiver.scope("relaxed_log_fav_based_ape_entity_2020_embedding_mem_cache"), + keyToString = { k: SimClustersEmbeddingId => embeddingCacheKeyBuilder.apply(k) } + ).mapValues(SimClustersEmbedding(_)) + + ObservedCachedReadableStore.from[SimClustersEmbeddingId, SimClustersEmbedding]( + memcachedStore, + ttl = 6.hours, + maxKeys = 262143, + cacheName = "relaxed_log_fav_based_ape_entity_2020_embedding_cache", + windowSize = 10000L + )(statsReceiver.scope("relaxed_log_fav_based_ape_entity_2020_embedding_cache_store")) + } + + lazy val favBasedProducer20M145K2020EmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val underlyingStore = ProducerClusterEmbeddingReadableStores + .getProducerTopKSimClusters2020EmbeddingsStore( + mhMtlsParams + ).composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId( + FavBasedProducer, + Model20m145k2020, + InternalId.UserId(userId)) => + userId + }.mapValues { topSimClustersWithScore => + ThriftSimClustersEmbedding(topSimClustersWithScore.topClusters.take(10)) + } + + // same memcache config as for favBasedUserInterestedIn20M145K2020Store + val 
memcachedStore = ObservedMemcachedReadableStore + .fromCacheClient( + backingStore = underlyingStore, + cacheClient = cacheClient, + ttl = 24.hours + )( + valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)), + statsReceiver = statsReceiver.scope("fav_based_producer_embedding_20M_145K_2020_mem_cache"), + keyToString = { k => embeddingCacheKeyBuilder.apply(k) } + ).mapValues(SimClustersEmbedding(_)) + + ObservedCachedReadableStore.from[SimClustersEmbeddingId, SimClustersEmbedding]( + memcachedStore, + ttl = 12.hours, + maxKeys = 16777215, + cacheName = "fav_based_producer_embedding_20M_145K_2020_embedding_cache", + windowSize = 10000L + )(statsReceiver.scope("fav_based_producer_embedding_20M_145K_2020_embedding_store")) + } + + // Production + lazy val interestedIn20M145KUpdatedStore: ReadableStore[UserId, ClustersUserIsInterestedIn] = { + UserInterestedInReadableStore.defaultStoreWithMtls( + mhMtlsParams, + modelVersion = ModelVersions.Model20M145KUpdated + ) + } + + // Production + lazy val interestedIn20M145K2020Store: ReadableStore[UserId, ClustersUserIsInterestedIn] = { + UserInterestedInReadableStore.defaultStoreWithMtls( + mhMtlsParams, + modelVersion = ModelVersions.Model20M145K2020 + ) + } + + // Production + lazy val InterestedInFromPE20M145KUpdatedStore: ReadableStore[ + UserId, + ClustersUserIsInterestedIn + ] = { + UserInterestedInReadableStore.defaultIIPEStoreWithMtls( + mhMtlsParams, + modelVersion = ModelVersions.Model20M145KUpdated) + } + + lazy val simClustersInterestedInStore: ReadableStore[ + (UserId, ModelVersion), + ClustersUserIsInterestedIn + ] = { + new ReadableStore[(UserId, ModelVersion), ClustersUserIsInterestedIn] { + override def get(k: (UserId, ModelVersion)): Future[Option[ClustersUserIsInterestedIn]] = { + k match { + case (userId, Model20m145kUpdated) => + interestedIn20M145KUpdatedStore.get(userId) + case (userId, Model20m145k2020) => + interestedIn20M145K2020Store.get(userId) + case _ => + 
Future.None + } + } + } + } + + lazy val simClustersInterestedInFromProducerEmbeddingsStore: ReadableStore[ + (UserId, ModelVersion), + ClustersUserIsInterestedIn + ] = { + new ReadableStore[(UserId, ModelVersion), ClustersUserIsInterestedIn] { + override def get(k: (UserId, ModelVersion)): Future[Option[ClustersUserIsInterestedIn]] = { + k match { + case (userId, ModelVersion.Model20m145kUpdated) => + InterestedInFromPE20M145KUpdatedStore.get(userId) + case _ => + Future.None + } + } + } + } + + lazy val userInterestedInStore = + new twistly.interestedin.EmbeddingStore( + interestedInStore = simClustersInterestedInStore, + interestedInFromProducerEmbeddingStore = simClustersInterestedInFromProducerEmbeddingsStore, + statsReceiver = statsReceiver + ) + + // Production + lazy val favBasedUserInterestedIn20M145KUpdatedStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val underlyingStore = + UserInterestedInReadableStore + .defaultSimClustersEmbeddingStoreWithMtls( + mhMtlsParams, + EmbeddingType.FavBasedUserInterestedIn, + ModelVersion.Model20m145kUpdated) + .mapValues(_.toThrift) + + val memcachedStore = ObservedMemcachedReadableStore + .fromCacheClient( + backingStore = underlyingStore, + cacheClient = cacheClient, + ttl = 12.hours + )( + valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)), + statsReceiver = statsReceiver.scope("fav_based_user_interested_in_mem_cache"), + keyToString = { k => embeddingCacheKeyBuilder.apply(k) } + ).mapValues(SimClustersEmbedding(_)) + + ObservedCachedReadableStore.from[SimClustersEmbeddingId, SimClustersEmbedding]( + memcachedStore, + ttl = 6.hours, + maxKeys = 262143, + cacheName = "fav_based_user_interested_in_cache", + windowSize = 10000L + )(statsReceiver.scope("fav_based_user_interested_in_store")) + } + + // Production + lazy val LogFavBasedInterestedInFromAPE20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val 
underlyingStore = + UserInterestedInReadableStore + .defaultIIAPESimClustersEmbeddingStoreWithMtls( + mhMtlsParams, + EmbeddingType.LogFavBasedUserInterestedInFromAPE, + ModelVersion.Model20m145k2020) + .mapValues(_.toThrift) + + val memcachedStore = ObservedMemcachedReadableStore + .fromCacheClient( + backingStore = underlyingStore, + cacheClient = cacheClient, + ttl = 12.hours + )( + valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)), + statsReceiver = statsReceiver.scope("log_fav_based_user_interested_in_from_ape_mem_cache"), + keyToString = { k => embeddingCacheKeyBuilder.apply(k) } + ).mapValues(SimClustersEmbedding(_)) + + ObservedCachedReadableStore.from[SimClustersEmbeddingId, SimClustersEmbedding]( + memcachedStore, + ttl = 6.hours, + maxKeys = 262143, + cacheName = "log_fav_based_user_interested_in_from_ape_cache", + windowSize = 10000L + )(statsReceiver.scope("log_fav_based_user_interested_in_from_ape_store")) + } + + // Production + lazy val FollowBasedInterestedInFromAPE20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val underlyingStore = + UserInterestedInReadableStore + .defaultIIAPESimClustersEmbeddingStoreWithMtls( + mhMtlsParams, + EmbeddingType.FollowBasedUserInterestedInFromAPE, + ModelVersion.Model20m145k2020) + .mapValues(_.toThrift) + + val memcachedStore = ObservedMemcachedReadableStore + .fromCacheClient( + backingStore = underlyingStore, + cacheClient = cacheClient, + ttl = 12.hours + )( + valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)), + statsReceiver = statsReceiver.scope("follow_based_user_interested_in_from_ape_mem_cache"), + keyToString = { k => embeddingCacheKeyBuilder.apply(k) } + ).mapValues(SimClustersEmbedding(_)) + + ObservedCachedReadableStore.from[SimClustersEmbeddingId, SimClustersEmbedding]( + memcachedStore, + ttl = 6.hours, + maxKeys = 262143, + cacheName = "follow_based_user_interested_in_from_ape_cache", + 
// Production
// LogFavBasedUserInterestedIn / Model20m145k2020 embeddings, read through a memcache layer
// (12 hour TTL) that is itself wrapped in an ObservedCachedReadableStore (6 hour TTL).
lazy val logFavBasedUserInterestedIn20M145K2020Store: ReadableStore[
  SimClustersEmbeddingId,
  SimClustersEmbedding
] = {
  val embeddingStore =
    UserInterestedInReadableStore
      .defaultSimClustersEmbeddingStoreWithMtls(
        mhMtlsParams,
        EmbeddingType.LogFavBasedUserInterestedIn,
        ModelVersion.Model20m145k2020
      )

  // Memcache stores the LZ4-compressed thrift form, so convert before caching and back after.
  // NOTE(review): the stats scope passed here is the same string as the scope passed to the
  // outer cache at the bottom of this block; sibling stores in this class give the memcache
  // layer a "..._mem_cache" scope instead. Confirm whether sharing the scope is intentional.
  val memcacheLayer = ObservedMemcachedReadableStore
    .fromCacheClient(
      backingStore = embeddingStore.mapValues(_.toThrift),
      cacheClient = cacheClient,
      ttl = 12.hours
    )(
      valueInjection = LZ4Injection.compose(BinaryScalaCodec(ThriftSimClustersEmbedding)),
      statsReceiver = statsReceiver.scope("log_fav_based_user_interested_in_2020_store"),
      keyToString = { id => embeddingCacheKeyBuilder(id) }
    ).mapValues(thrift => SimClustersEmbedding(thrift))

  ObservedCachedReadableStore.from[SimClustersEmbeddingId, SimClustersEmbedding](
    memcacheLayer,
    ttl = 6.hours,
    maxKeys = 262143,
    cacheName = "log_fav_based_user_interested_in_2020_cache",
    windowSize = 10000L
  )(statsReceiver.scope("log_fav_based_user_interested_in_2020_store"))
}
+ Model20m145k2020) -> relaxedLogFavBasedApe20M145K2020EmbeddingCachedStore, + // InterestedIn Embeddings + ( + LogFavBasedUserInterestedInFromAPE, + Model20m145k2020) -> LogFavBasedInterestedInFromAPE20M145K2020Store, + ( + FollowBasedUserInterestedInFromAPE, + Model20m145k2020) -> FollowBasedInterestedInFromAPE20M145K2020Store, + (FavBasedUserInterestedIn, Model20m145kUpdated) -> favBasedUserInterestedIn20M145KUpdatedStore, + (FavBasedUserInterestedIn, Model20m145k2020) -> favBasedUserInterestedIn20M145K2020Store, + (LogFavBasedUserInterestedIn, Model20m145k2020) -> logFavBasedUserInterestedIn20M145K2020Store, + ( + FavBasedUserInterestedInFromPE, + Model20m145kUpdated) -> favBasedUserInterestedInFromPE20M145KUpdatedStore, + (FilteredUserInterestedIn, Model20m145kUpdated) -> userInterestedInStore, + (FilteredUserInterestedIn, Model20m145k2020) -> userInterestedInStore, + (FilteredUserInterestedInFromPE, Model20m145kUpdated) -> userInterestedInStore, + (UnfilteredUserInterestedIn, Model20m145kUpdated) -> userInterestedInStore, + (UnfilteredUserInterestedIn, Model20m145k2020) -> userInterestedInStore, + ) + + val simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { + val underlying: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = + SimClustersEmbeddingStore.buildWithDecider( + underlyingStores = underlyingStores, + decider = rmsDecider.decider, + statsReceiver = statsReceiver.scope("simClusters_embeddings_store_deciderable") + ) + + val underlyingWithTimeout: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = + new ReadableStoreWithTimeout( + rs = underlying, + decider = rmsDecider.decider, + enableTimeoutDeciderKey = DeciderConstants.enableSimClustersEmbeddingStoreTimeouts, + timeoutValueKey = DeciderConstants.simClustersEmbeddingStoreTimeoutValueMillis, + timer = timer, + statsReceiver = statsReceiver.scope("simClusters_embedding_store_timeouts") + ) + + ObservedReadableStore( + store = 
underlyingWithTimeout + )(statsReceiver.scope("simClusters_embeddings_store")) + } +} diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/BUILD b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/BUILD new file mode 100644 index 000000000..ab19a1dd7 --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/BUILD @@ -0,0 +1,18 @@ +scala_library( + compiler_option_sets = ["fatal_warnings"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", + "finagle/finagle-stats", + "finatra/inject/inject-core/src/main/scala", + "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/util", + "interests-service/thrift/src/main/thrift:thrift-scala", + "representation-manager/server/src/main/scala/com/twitter/representation_manager/common", + "servo/util", + "src/scala/com/twitter/storehaus_internal/manhattan", + "src/scala/com/twitter/storehaus_internal/memcache", + "src/scala/com/twitter/storehaus_internal/util", + "strato/src/main/scala/com/twitter/strato/client", + ], +) diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/CacheModule.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/CacheModule.scala new file mode 100644 index 000000000..a042225fa --- /dev/null +++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/CacheModule.scala @@ -0,0 +1,34 @@ +package com.twitter.representation_manager.modules + +import com.google.inject.Provides +import com.twitter.finagle.memcached.Client +import javax.inject.Singleton +import com.twitter.conversions.DurationOps._ +import com.twitter.inject.TwitterModule +import com.twitter.finagle.mtls.authentication.ServiceIdentifier +import 
com.twitter.finagle.stats.StatsReceiver
+import com.twitter.storehaus_internal.memcache.MemcacheStore
+import com.twitter.storehaus_internal.util.ClientName
+import com.twitter.storehaus_internal.util.ZkEndPoint
+// Guice module that builds the memcached Client from command-line flags.
+object CacheModule extends TwitterModule {
+  // Flags: cache destination, client timeout (applied as milliseconds below) and retry count.
+  private val cacheDest = flag[String]("cache_module.dest", "Path to memcache service")
+  private val timeout = flag[Int]("memcache.timeout", "Memcache client timeout")
+  private val retries = flag[Int]("memcache.retries", "Memcache timeout retries")
+  // Singleton client named "memcache_representation_manager"; stats go under "cache_client".
+  @Singleton
+  @Provides
+  def providesCache(
+    serviceIdentifier: ServiceIdentifier,
+    stats: StatsReceiver
+  ): Client =
+    MemcacheStore.memcachedClient(
+      name = ClientName("memcache_representation_manager"),
+      dest = ZkEndPoint(cacheDest()),
+      timeout = timeout().milliseconds,
+      retries = retries(),
+      statsReceiver = stats.scope("cache_client"),
+      serviceIdentifier = serviceIdentifier
+    )
+}
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/InterestsThriftClientModule.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/InterestsThriftClientModule.scala
new file mode 100644
index 000000000..82a5a5004
--- /dev/null
+++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/InterestsThriftClientModule.scala
@@ -0,0 +1,40 @@
+package com.twitter.representation_manager.modules
+
+import com.google.inject.Provides
+import com.twitter.conversions.DurationOps._
+import com.twitter.finagle.ThriftMux
+import com.twitter.finagle.mtls.authentication.ServiceIdentifier
+import com.twitter.finagle.mtls.client.MtlsStackClient.MtlsThriftMuxClientSyntax
+import com.twitter.finagle.mux.ClientDiscardedRequestException
+import com.twitter.finagle.service.ReqRep
+import com.twitter.finagle.service.ResponseClass
+import com.twitter.finagle.stats.StatsReceiver
+import com.twitter.finagle.thrift.ClientId
+import com.twitter.inject.TwitterModule
+import
com.twitter.interests.thriftscala.InterestsThriftService
+import com.twitter.util.Throw
+import javax.inject.Singleton
+// Guice module that builds the ThriftMux client for the interests thrift service.
+object InterestsThriftClientModule extends TwitterModule {
+  // The client uses mutual TLS, a 450ms request timeout and stats scoped "InterestsThriftClient".
+  @Singleton
+  @Provides
+  def providesInterestsThriftClient(
+    clientId: ClientId,
+    serviceIdentifier: ServiceIdentifier,
+    statsReceiver: StatsReceiver
+  ): InterestsThriftService.MethodPerEndpoint = {
+    ThriftMux.client
+      .withClientId(clientId)
+      .withMutualTls(serviceIdentifier)
+      .withRequestTimeout(450.milliseconds)
+      .withStatsReceiver(statsReceiver.scope("InterestsThriftClient"))
+      .withResponseClassifier { // responses failing with a discarded request are Ignorable
+        case ReqRep(_, Throw(_: ClientDiscardedRequestException)) => ResponseClass.Ignorable
+      }
+      .build[InterestsThriftService.MethodPerEndpoint](
+        dest = "/s/interests-thrift-service/interests-thrift-service",
+        label = "interests_thrift_service"
+      )
+  }
+}
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/LegacyRMSConfigModule.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/LegacyRMSConfigModule.scala
new file mode 100644
index 000000000..0a06dffe6
--- /dev/null
+++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/LegacyRMSConfigModule.scala
@@ -0,0 +1,18 @@
+package com.twitter.representation_manager.modules
+
+import com.google.inject.Provides
+import com.twitter.inject.TwitterModule
+import javax.inject.Named
+import javax.inject.Singleton
+// Constant @Named bindings: a String cache-key prefix and a Boolean configuration switch.
+object LegacyRMSConfigModule extends TwitterModule {
+  @Singleton
+  @Provides
+  @Named("cacheHashKeyPrefix")
+  def providesCacheHashKeyPrefix: String = "RMS"
+  // Always false here; the consumers of this @Named value are not visible in this file.
+  @Singleton
+  @Provides
+  @Named("useContentRecommenderConfiguration")
+  def providesUseContentRecommenderConfiguration: Boolean = false
+}
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/StoreModule.scala
b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/StoreModule.scala
new file mode 100644
index 000000000..a2efe5925
--- /dev/null
+++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/StoreModule.scala
@@ -0,0 +1,24 @@
+package com.twitter.representation_manager.modules
+
+import com.google.inject.Provides
+import javax.inject.Singleton
+import com.twitter.inject.TwitterModule
+import com.twitter.decider.Decider
+import com.twitter.finagle.mtls.authentication.ServiceIdentifier
+import com.twitter.representation_manager.common.RepresentationManagerDecider
+import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams
+// Guice module providing the Manhattan mTLS params and the RepresentationManagerDecider wrapper.
+object StoreModule extends TwitterModule {
+  @Singleton
+  @Provides
+  def providesMhMtlsParams(
+    serviceIdentifier: ServiceIdentifier
+  ): ManhattanKVClientMtlsParams = ManhattanKVClientMtlsParams(serviceIdentifier)
+  // Wraps the injected Decider so stores can depend on RepresentationManagerDecider.
+  @Singleton
+  @Provides
+  def providesRmsDecider(
+    decider: Decider
+  ): RepresentationManagerDecider = RepresentationManagerDecider(decider)
+
+}
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/TimerModule.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/TimerModule.scala
new file mode 100644
index 000000000..fe7fddb45
--- /dev/null
+++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/TimerModule.scala
@@ -0,0 +1,13 @@
+package com.twitter.representation_manager.modules
+
+import com.google.inject.Provides
+import com.twitter.finagle.util.DefaultTimer
+import com.twitter.inject.TwitterModule
+import com.twitter.util.Timer
+import javax.inject.Singleton
+// Guice module binding com.twitter.util.Timer to Finagle's DefaultTimer.
+object TimerModule extends TwitterModule {
+  @Singleton
+  @Provides
+  def providesTimer: Timer = DefaultTimer
+}
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/UttClientModule.scala
b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/UttClientModule.scala
new file mode 100644
index 000000000..cc2100c1c
--- /dev/null
+++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/modules/UttClientModule.scala
@@ -0,0 +1,39 @@
+package com.twitter.representation_manager.modules
+
+import com.google.inject.Provides
+import com.twitter.escherbird.util.uttclient.CacheConfigV2
+import com.twitter.escherbird.util.uttclient.CachedUttClientV2
+import com.twitter.escherbird.util.uttclient.UttClientCacheConfigsV2
+import com.twitter.escherbird.utt.strato.thriftscala.Environment
+import com.twitter.finagle.stats.StatsReceiver
+import com.twitter.inject.TwitterModule
+import com.twitter.strato.client.{Client => StratoClient}
+import javax.inject.Singleton
+// Guice module that builds the cached UTT client on top of the injected Strato client.
+object UttClientModule extends TwitterModule {
+
+  @Singleton
+  @Provides
+  def providesUttClient(
+    stratoClient: StratoClient,
+    statsReceiver: StatsReceiver
+  ): CachedUttClientV2 = {
+    // Cache size 262143 = 2^18 - 1 (presumably max entries - confirm); aims for a ~100% hit rate.
+    val defaultCacheConfigV2: CacheConfigV2 = CacheConfigV2(262143)
+    // The same cache config is applied to all four cached calls.
+    val uttClientCacheConfigsV2: UttClientCacheConfigsV2 = UttClientCacheConfigsV2(
+      getTaxonomyConfig = defaultCacheConfigV2,
+      getUttTaxonomyConfig = defaultCacheConfigV2,
+      getLeafIds = defaultCacheConfigV2,
+      getLeafUttEntities = defaultCacheConfigV2
+    )
+
+    // Strato-backed cached UTT client; the environment is fixed to Environment.Prod.
+    new CachedUttClientV2(
+      stratoClient = stratoClient,
+      env = Environment.Prod,
+      cacheConfigs = uttClientCacheConfigsV2,
+      statsReceiver = statsReceiver.scope("cached_utt_client")
+    )
+  }
+}
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/BUILD b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/BUILD
new file mode 100644
index 000000000..1731a2649
--- /dev/null
+++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/BUILD
@@ -0,0 +1,16 @@
+scala_library(
+    compiler_option_sets = ["fatal_warnings"],
+    platform = "java8",
+    tags = ["bazel-compatible"],
+    dependencies = [
+        "content-recommender/server/src/main/scala/com/twitter/contentrecommender:representation-manager-deps",
+        "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/util",
+        "hermit/hermit-core/src/main/scala/com/twitter/hermit/store/common",
+        "representation-manager/server/src/main/scala/com/twitter/representation_manager/common",
+        "src/scala/com/twitter/simclusters_v2/stores",
+        "src/scala/com/twitter/simclusters_v2/summingbird/stores",
+        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
+        "storage/clients/manhattan/client/src/main/scala",
+        "tweetypie/src/scala/com/twitter/tweetypie/util",
+    ],
+)
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.scala
new file mode 100644
index
000000000..dd00ea126
--- /dev/null
+++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.scala
@@ -0,0 +1,39 @@
+package com.twitter.representation_manager.store
+
+import com.twitter.servo.decider.DeciderKeyEnum
+// Decider key names, kept as plain String constants.
+object DeciderConstants {
+  // Deciders inherited from CR and RSX and only used in LegacyRMS.
+  // Their values are controlled through CR's and RSX's yml files and their decider dashboards.
+  // They will be removed once the migration is complete.
+  val enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore =
+    "enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore"
+
+  val enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore =
+    "enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore"
+  // Timeout switch and timeout-value (millis) keys for the 20m145k2020 tweet embedding store.
+  val enablelogFavBased20M145K2020TweetEmbeddingStoreTimeouts =
+    "enable_log_fav_based_tweet_embedding_20m145k2020_timeouts"
+  val logFavBased20M145K2020TweetEmbeddingStoreTimeoutValueMillis =
+    "log_fav_based_tweet_embedding_20m145k2020_timeout_value_millis"
+  // Timeout switch and timeout-value (millis) keys for the 20m145kUpdated tweet embedding store.
+  val enablelogFavBased20M145KUpdatedTweetEmbeddingStoreTimeouts =
+    "enable_log_fav_based_tweet_embedding_20m145kUpdated_timeouts"
+  val logFavBased20M145KUpdatedTweetEmbeddingStoreTimeoutValueMillis =
+    "log_fav_based_tweet_embedding_20m145kUpdated_timeout_value_millis"
+  // Timeout switch and timeout-value (millis) keys for the combined SimClusters embedding store.
+  val enableSimClustersEmbeddingStoreTimeouts = "enable_sim_clusters_embedding_store_timeouts"
+  val simClustersEmbeddingStoreTimeoutValueMillis =
+    "sim_clusters_embedding_store_timeout_value_millis"
+}
+
+// DeciderKeyEnum values for the two cached-store switches; necessary for using servo Gates.
+object DeciderKey extends DeciderKeyEnum {
+  val enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore: Value = Value(
+    DeciderConstants.enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore
+  )
+
+  val enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore: Value = Value(
+    DeciderConstants.enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore
+  )
+}
diff --git
a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.scala
new file mode 100644
index 000000000..cc6485b79
--- /dev/null
+++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.scala
@@ -0,0 +1,198 @@
+package com.twitter.representation_manager.store
+
+import com.twitter.contentrecommender.store.ApeEntityEmbeddingStore
+import com.twitter.contentrecommender.store.InterestsOptOutStore
+import com.twitter.contentrecommender.store.SemanticCoreTopicSeedStore
+import com.twitter.conversions.DurationOps._
+import com.twitter.escherbird.util.uttclient.CachedUttClientV2
+import com.twitter.finagle.memcached.Client
+import com.twitter.finagle.stats.StatsReceiver
+import com.twitter.frigate.common.store.strato.StratoFetchableStore
+import com.twitter.frigate.common.util.SeqLongInjection
+import com.twitter.hermit.store.common.ObservedCachedReadableStore
+import com.twitter.hermit.store.common.ObservedMemcachedReadableStore
+import com.twitter.hermit.store.common.ObservedReadableStore
+import com.twitter.interests.thriftscala.InterestsThriftService
+import com.twitter.representation_manager.common.MemCacheConfig
+import com.twitter.representation_manager.common.RepresentationManagerDecider
+import com.twitter.simclusters_v2.common.SimClustersEmbedding
+import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore
+import com.twitter.simclusters_v2.thriftscala.EmbeddingType
+import com.twitter.simclusters_v2.thriftscala.EmbeddingType._
+import com.twitter.simclusters_v2.thriftscala.InternalId
+import com.twitter.simclusters_v2.thriftscala.ModelVersion
+import com.twitter.simclusters_v2.thriftscala.ModelVersion._
+import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
+import com.twitter.simclusters_v2.thriftscala.TopicId
+import com.twitter.simclusters_v2.thriftscala.LocaleEntityId
+import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding}
+import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams
+import com.twitter.storehaus.ReadableStore
+import com.twitter.strato.client.{Client => StratoClient}
+import com.twitter.tweetypie.util.UserId
+import javax.inject.Inject
+// Topic SimClusters embedding stores; topicSimClustersEmbeddingStore is the only public member.
+class TopicSimClustersEmbeddingStore @Inject() (
+  stratoClient: StratoClient,
+  cacheClient: Client,
+  globalStats: StatsReceiver,
+  mhMtlsParams: ManhattanKVClientMtlsParams, // NOTE(review): not referenced in this class
+  rmsDecider: RepresentationManagerDecider,
+  interestService: InterestsThriftService.MethodPerEndpoint,
+  uttClient: CachedUttClientV2) {
+
+  private val stats = globalStats.scope(this.getClass.getSimpleName) // scoped by class name
+  private val interestsOptOutStore = InterestsOptOutStore(interestService)
+
+  /**
+   * Note this is NOT an embedding store. It maps a topic key to the list of author account ids we
+   * use to represent that topic, cached in memcached (12h TTL) and then in a 6h-TTL cache.
+   */
+  private val semanticCoreTopicSeedStore: ReadableStore[
+    SemanticCoreTopicSeedStore.Key,
+    Seq[UserId]
+  ] = {
+    /*
+      Up to 1000 Long seeds per topic/language = 62.5kb per topic/language (worst case)
+      Assume ~10k active topic/languages ~= 650MB (worst case)
+      NOTE(review): 1000 Longs are 64,000 bits (~8KB), so these figures read as bits - confirm. */
+    val underlying = new SemanticCoreTopicSeedStore(uttClient, interestsOptOutStore)(
+      stats.scope("semantic_core_topic_seed_store"))
+
+    val memcacheStore = ObservedMemcachedReadableStore.fromCacheClient(
+      backingStore = underlying,
+      cacheClient = cacheClient,
+      ttl = 12.hours)(
+      valueInjection = SeqLongInjection,
+      statsReceiver = stats.scope("topic_producer_seed_store_mem_cache"),
+      keyToString = { k => s"tpss:${k.entityId}_${k.languageCode}" }
+    )
+
+    ObservedCachedReadableStore.from[SemanticCoreTopicSeedStore.Key, Seq[UserId]](
+      store = memcacheStore,
+      ttl = 6.hours,
+      maxKeys = 20e3.toInt, // 20000 keys
+      cacheName = "topic_producer_seed_store_cache",
+      windowSize = 5000
+    )(stats.scope("topic_producer_seed_store_cache"))
+  }
+  // FavTfgTopic embeddings fetched from Strato, cut with truncate = 50, keyed by LocaleEntityId.
+  private val favBasedTfgTopicEmbedding20m145k2020Store: ReadableStore[
+    SimClustersEmbeddingId,
+    SimClustersEmbedding
+  ] = {
+    val rawStore =
+      StratoFetchableStore
+        .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding](
+          stratoClient,
+          "recommendations/simclusters_v2/embeddings/favBasedTFGTopic20M145K2020").mapValues(
+          embedding => SimClustersEmbedding(embedding, truncate = 50).toThrift)
+        .composeKeyMapping[LocaleEntityId] { localeEntityId =>
+          SimClustersEmbeddingId(
+            FavTfgTopic,
+            Model20m145k2020,
+            InternalId.LocaleEntityId(localeEntityId))
+        }
+
+    buildLocaleEntityIdMemCacheStore(rawStore, FavTfgTopic, Model20m145k2020)
+  }
+  // LogFavBasedKgoApeTopic embeddings, built from the topic seed store and a Strato producer store.
+  private val logFavBasedApeEntity20M145K2020EmbeddingStore: ReadableStore[
+    SimClustersEmbeddingId,
+    SimClustersEmbedding
+  ] = {
+    val apeStore = StratoFetchableStore
+      .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding](
+        stratoClient,
+        "recommendations/simclusters_v2/embeddings/logFavBasedAPE20M145K2020")
+      .mapValues(embedding => SimClustersEmbedding(embedding, truncate = 50))
+      .composeKeyMapping[UserId]({ id => // user id -> AggregatableLogFavBasedProducer embedding id
+        SimClustersEmbeddingId(
+          AggregatableLogFavBasedProducer,
+          Model20m145k2020,
+          InternalId.UserId(id))
+      })
+    val rawStore = new ApeEntityEmbeddingStore(
+      semanticCoreSeedStore = semanticCoreTopicSeedStore,
+      aggregatableProducerEmbeddingStore = apeStore,
+      statsReceiver = stats.scope("log_fav_based_ape_entity_2020_embedding_store"))
+      .mapValues(embedding => SimClustersEmbedding(embedding.toThrift, truncate = 50).toThrift)
+      .composeKeyMapping[TopicId] { topicId =>
+        SimClustersEmbeddingId(
+          LogFavBasedKgoApeTopic,
+          Model20m145k2020,
+          InternalId.TopicId(topicId))
+      }
+
+    buildTopicIdMemCacheStore(rawStore, LogFavBasedKgoApeTopic, Model20m145k2020)
+  }
+  // Adds per-type/per-model stats and MemCacheConfig caching to a TopicId-keyed thrift store.
+  private def buildTopicIdMemCacheStore(
+    rawStore: ReadableStore[TopicId, ThriftSimClustersEmbedding],
+    embeddingType: EmbeddingType,
+    modelVersion: ModelVersion
+  ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
+    val observedStore: ObservedReadableStore[TopicId, ThriftSimClustersEmbedding] =
+      ObservedReadableStore(
+        store = rawStore
+      )(stats.scope(embeddingType.name).scope(modelVersion.name))
+    // Only TopicId-based ids match this case; other InternalIds presumably raise a MatchError.
+    val storeWithKeyMapping = observedStore.composeKeyMapping[SimClustersEmbeddingId] {
+      case SimClustersEmbeddingId(_, _, InternalId.TopicId(topicId)) =>
+        topicId
+    }
+
+    MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding(
+      storeWithKeyMapping,
+      cacheClient,
+      embeddingType,
+      modelVersion,
+      stats
+    )
+  }
+  // Same wrapping as buildTopicIdMemCacheStore, for stores keyed by LocaleEntityId.
+  private def buildLocaleEntityIdMemCacheStore(
+    rawStore: ReadableStore[LocaleEntityId, ThriftSimClustersEmbedding],
+    embeddingType: EmbeddingType,
+    modelVersion: ModelVersion
+  ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
+    val observedStore: ObservedReadableStore[LocaleEntityId, ThriftSimClustersEmbedding] =
+      ObservedReadableStore(
+        store = rawStore
+      )(stats.scope(embeddingType.name).scope(modelVersion.name))
+    // Only LocaleEntityId-based ids match this case; others presumably raise a MatchError.
+    val storeWithKeyMapping = observedStore.composeKeyMapping[SimClustersEmbeddingId] {
+      case SimClustersEmbeddingId(_, _, InternalId.LocaleEntityId(localeEntityId)) =>
+        localeEntityId
+    }
+
+    MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding(
+      storeWithKeyMapping,
+      cacheClient,
+      embeddingType,
+      modelVersion,
+      stats
+    )
+  }
+  // Lookup table: the store that serves each (EmbeddingType, ModelVersion) pair.
+  private val underlyingStores: Map[
+    (EmbeddingType, ModelVersion),
+    ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding]
+  ] = Map(
+    // Topic Embeddings
+    (FavTfgTopic, Model20m145k2020) -> favBasedTfgTopicEmbedding20m145k2020Store,
+    (LogFavBasedKgoApeTopic, Model20m145k2020) -> logFavBasedApeEntity20M145K2020EmbeddingStore,
+  )
+  // Public entry point, assembled from underlyingStores by SimClustersEmbeddingStore.buildWithDecider.
+  val topicSimClustersEmbeddingStore: ReadableStore[
+    SimClustersEmbeddingId,
+    SimClustersEmbedding
+  ] = {
+    SimClustersEmbeddingStore.buildWithDecider(
+      underlyingStores = underlyingStores,
+      decider = rmsDecider.decider,
+      statsReceiver = stats
+    )
+  }
+
+}
diff --git
a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.scala
new file mode 100644
index 000000000..857e38649
--- /dev/null
+++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.scala
@@ -0,0 +1,141 @@
+package com.twitter.representation_manager.store
+
+import com.twitter.finagle.memcached.Client
+import com.twitter.finagle.stats.StatsReceiver
+import com.twitter.hermit.store.common.ObservedReadableStore
+import com.twitter.representation_manager.common.MemCacheConfig
+import com.twitter.representation_manager.common.RepresentationManagerDecider
+import com.twitter.simclusters_v2.common.SimClustersEmbedding
+import com.twitter.simclusters_v2.common.TweetId
+import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore
+import com.twitter.simclusters_v2.summingbird.stores.PersistentTweetEmbeddingStore
+import com.twitter.simclusters_v2.thriftscala.EmbeddingType
+import com.twitter.simclusters_v2.thriftscala.EmbeddingType._
+import com.twitter.simclusters_v2.thriftscala.InternalId
+import com.twitter.simclusters_v2.thriftscala.ModelVersion
+import com.twitter.simclusters_v2.thriftscala.ModelVersion._
+import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
+import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding}
+import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams
+import com.twitter.storehaus.ReadableStore
+import javax.inject.Inject
+// Tweet SimClusters embedding stores built on PersistentTweetEmbeddingStore, plus a combined store.
+class TweetSimClustersEmbeddingStore @Inject() (
+  cacheClient: Client,
+  globalStats: StatsReceiver,
+  mhMtlsParams: ManhattanKVClientMtlsParams,
+  rmsDecider: RepresentationManagerDecider) {
+
+  private val stats = globalStats.scope(this.getClass.getSimpleName) // scoped by class name
+  // LogFavLongestL2EmbeddingTweet / Model20m145kUpdated, from the longest-L2-norm store.
+  val logFavBasedLongestL2Tweet20M145KUpdatedEmbeddingStore: ReadableStore[
+    SimClustersEmbeddingId,
+    SimClustersEmbedding
+  ] = {
+    val rawStore =
+      PersistentTweetEmbeddingStore
+        .longestL2NormTweetEmbeddingStoreManhattan(
+          mhMtlsParams,
+          PersistentTweetEmbeddingStore.LogFavBased20m145kUpdatedDataset,
+          stats
+        ).mapValues(_.toThrift)
+
+    buildMemCacheStore(rawStore, LogFavLongestL2EmbeddingTweet, Model20m145kUpdated)
+  }
+  // LogFavLongestL2EmbeddingTweet / Model20m145k2020, from the longest-L2-norm store.
+  val logFavBasedLongestL2Tweet20M145K2020EmbeddingStore: ReadableStore[
+    SimClustersEmbeddingId,
+    SimClustersEmbedding
+  ] = {
+    val rawStore =
+      PersistentTweetEmbeddingStore
+        .longestL2NormTweetEmbeddingStoreManhattan(
+          mhMtlsParams,
+          PersistentTweetEmbeddingStore.LogFavBased20m145k2020Dataset,
+          stats
+        ).mapValues(_.toThrift)
+
+    buildMemCacheStore(rawStore, LogFavLongestL2EmbeddingTweet, Model20m145k2020)
+  }
+  // LogFavBasedTweet / Model20m145kUpdated, from the most-recent tweet embedding store.
+  val logFavBased20M145KUpdatedTweetEmbeddingStore: ReadableStore[
+    SimClustersEmbeddingId,
+    SimClustersEmbedding
+  ] = {
+    val rawStore =
+      PersistentTweetEmbeddingStore
+        .mostRecentTweetEmbeddingStoreManhattan(
+          mhMtlsParams,
+          PersistentTweetEmbeddingStore.LogFavBased20m145kUpdatedDataset,
+          stats
+        ).mapValues(_.toThrift)
+
+    buildMemCacheStore(rawStore, LogFavBasedTweet, Model20m145kUpdated)
+  }
+  // LogFavBasedTweet / Model20m145k2020, from the most-recent tweet embedding store.
+  val logFavBased20M145K2020TweetEmbeddingStore: ReadableStore[
+    SimClustersEmbeddingId,
+    SimClustersEmbedding
+  ] = {
+    val rawStore =
+      PersistentTweetEmbeddingStore
+        .mostRecentTweetEmbeddingStoreManhattan(
+          mhMtlsParams,
+          PersistentTweetEmbeddingStore.LogFavBased20m145k2020Dataset,
+          stats
+        ).mapValues(_.toThrift)
+
+    buildMemCacheStore(rawStore, LogFavBasedTweet, Model20m145k2020)
+  }
+  // Adds per-type/per-model stats and MemCacheConfig caching to a TweetId-keyed thrift store.
+  private def buildMemCacheStore(
+    rawStore: ReadableStore[TweetId, ThriftSimClustersEmbedding],
+    embeddingType: EmbeddingType,
+    modelVersion: ModelVersion
+  ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
+    val observedStore: ObservedReadableStore[TweetId, ThriftSimClustersEmbedding] =
+      ObservedReadableStore(
+        store = rawStore
+      )(stats.scope(embeddingType.name).scope(modelVersion.name))
+    // Only TweetId-based ids match this case; other InternalIds presumably raise a MatchError.
+    val storeWithKeyMapping = observedStore.composeKeyMapping[SimClustersEmbeddingId] {
+      case SimClustersEmbeddingId(_, _, InternalId.TweetId(tweetId)) =>
+        tweetId
+    }
+
+    MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding(
+      storeWithKeyMapping,
+      cacheClient,
+      embeddingType,
+      modelVersion,
+      stats
+    )
+  }
+  // Lookup table: the store that serves each (EmbeddingType, ModelVersion) pair.
+  private val underlyingStores: Map[
+    (EmbeddingType, ModelVersion),
+    ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding]
+  ] = Map(
+    // Tweet Embeddings
+    (LogFavBasedTweet, Model20m145kUpdated) -> logFavBased20M145KUpdatedTweetEmbeddingStore,
+    (LogFavBasedTweet, Model20m145k2020) -> logFavBased20M145K2020TweetEmbeddingStore,
+    (
+      LogFavLongestL2EmbeddingTweet,
+      Model20m145kUpdated) -> logFavBasedLongestL2Tweet20M145KUpdatedEmbeddingStore,
+    (
+      LogFavLongestL2EmbeddingTweet,
+      Model20m145k2020) -> logFavBasedLongestL2Tweet20M145K2020EmbeddingStore,
+  )
+  // Combined store, assembled from underlyingStores by SimClustersEmbeddingStore.buildWithDecider.
+  val tweetSimClustersEmbeddingStore: ReadableStore[
+    SimClustersEmbeddingId,
+    SimClustersEmbedding
+  ] = {
+    SimClustersEmbeddingStore.buildWithDecider(
+      underlyingStores = underlyingStores,
+      decider = rmsDecider.decider,
+      statsReceiver = stats
+    )
+  }
+
+}
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.scala
new file mode 100644
index 000000000..b416d9b17
--- /dev/null
+++ b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.scala
@@ -0,0 +1,602 @@
+package com.twitter.representation_manager.store
+
+import com.twitter.contentrecommender.twistly
+import com.twitter.finagle.memcached.Client
+import com.twitter.finagle.stats.StatsReceiver
+import com.twitter.frigate.common.store.strato.StratoFetchableStore
+import com.twitter.hermit.store.common.ObservedReadableStore +import com.twitter.representation_manager.common.MemCacheConfig +import com.twitter.representation_manager.common.RepresentationManagerDecider +import com.twitter.simclusters_v2.common.ModelVersions +import com.twitter.simclusters_v2.common.SimClustersEmbedding +import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore +import com.twitter.simclusters_v2.summingbird.stores.ProducerClusterEmbeddingReadableStores +import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore +import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.getStore +import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.modelVersionToDatasetMap +import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.knownModelVersions +import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.toSimClustersEmbedding +import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn +import com.twitter.simclusters_v2.thriftscala.EmbeddingType +import com.twitter.simclusters_v2.thriftscala.EmbeddingType._ +import com.twitter.simclusters_v2.thriftscala.InternalId +import com.twitter.simclusters_v2.thriftscala.ModelVersion +import com.twitter.simclusters_v2.thriftscala.ModelVersion._ +import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId +import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} +import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams +import com.twitter.storehaus.ReadableStore +import com.twitter.storehaus_internal.manhattan.Apollo +import com.twitter.storehaus_internal.manhattan.ManhattanCluster +import com.twitter.strato.client.{Client => StratoClient} +import com.twitter.strato.thrift.ScroogeConvImplicits._ +import com.twitter.tweetypie.util.UserId +import com.twitter.util.Future +import javax.inject.Inject + +class 
UserSimClustersEmbeddingStore @Inject() ( + stratoClient: StratoClient, + cacheClient: Client, + globalStats: StatsReceiver, + mhMtlsParams: ManhattanKVClientMtlsParams, + rmsDecider: RepresentationManagerDecider) { + + private val stats = globalStats.scope(this.getClass.getSimpleName) + + private val favBasedProducer20M145KUpdatedEmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val rawStore = ProducerClusterEmbeddingReadableStores + .getProducerTopKSimClustersEmbeddingsStore( + mhMtlsParams + ).mapValues { topSimClustersWithScore => + ThriftSimClustersEmbedding(topSimClustersWithScore.topClusters) + }.composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId(_, _, InternalId.UserId(userId)) => + userId + } + + buildMemCacheStore(rawStore, FavBasedProducer, Model20m145kUpdated) + } + + private val favBasedProducer20M145K2020EmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val rawStore = ProducerClusterEmbeddingReadableStores + .getProducerTopKSimClusters2020EmbeddingsStore( + mhMtlsParams + ).mapValues { topSimClustersWithScore => + ThriftSimClustersEmbedding(topSimClustersWithScore.topClusters) + }.composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId(_, _, InternalId.UserId(userId)) => + userId + } + + buildMemCacheStore(rawStore, FavBasedProducer, Model20m145k2020) + } + + private val followBasedProducer20M145K2020EmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val rawStore = ProducerClusterEmbeddingReadableStores + .getProducerTopKSimClustersEmbeddingsByFollowStore( + mhMtlsParams + ).mapValues { topSimClustersWithScore => + ThriftSimClustersEmbedding(topSimClustersWithScore.topClusters) + }.composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId(_, _, InternalId.UserId(userId)) => + userId + } + + buildMemCacheStore(rawStore, FollowBasedProducer, Model20m145k2020) + } + + private val 
logFavBasedApe20M145K2020EmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val rawStore = StratoFetchableStore + .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding]( + stratoClient, + "recommendations/simclusters_v2/embeddings/logFavBasedAPE20M145K2020") + .mapValues(embedding => SimClustersEmbedding(embedding, truncate = 50).toThrift) + + buildMemCacheStore(rawStore, AggregatableLogFavBasedProducer, Model20m145k2020) + } + + private val rawRelaxedLogFavBasedApe20M145K2020EmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + ThriftSimClustersEmbedding + ] = { + StratoFetchableStore + .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding]( + stratoClient, + "recommendations/simclusters_v2/embeddings/logFavBasedAPERelaxedFavEngagementThreshold20M145K2020") + .mapValues(embedding => SimClustersEmbedding(embedding, truncate = 50).toThrift) + } + + private val relaxedLogFavBasedApe20M145K2020EmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildMemCacheStore( + rawRelaxedLogFavBasedApe20M145K2020EmbeddingStore, + RelaxedAggregatableLogFavBasedProducer, + Model20m145k2020) + } + + private val relaxedLogFavBasedApe20m145kUpdatedEmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val rawStore = rawRelaxedLogFavBasedApe20M145K2020EmbeddingStore + .composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId( + RelaxedAggregatableLogFavBasedProducer, + Model20m145kUpdated, + internalId) => + SimClustersEmbeddingId( + RelaxedAggregatableLogFavBasedProducer, + Model20m145k2020, + internalId) + } + + buildMemCacheStore(rawStore, RelaxedAggregatableLogFavBasedProducer, Model20m145kUpdated) + } + + private val logFavBasedInterestedInFromAPE20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildUserInterestedInStore( + 
UserInterestedInReadableStore.defaultIIAPESimClustersEmbeddingStoreWithMtls, + LogFavBasedUserInterestedInFromAPE, + Model20m145k2020) + } + + private val followBasedInterestedInFromAPE20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildUserInterestedInStore( + UserInterestedInReadableStore.defaultIIAPESimClustersEmbeddingStoreWithMtls, + FollowBasedUserInterestedInFromAPE, + Model20m145k2020) + } + + private val favBasedUserInterestedIn20M145KUpdatedStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildUserInterestedInStore( + UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls, + FavBasedUserInterestedIn, + Model20m145kUpdated) + } + + private val favBasedUserInterestedIn20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildUserInterestedInStore( + UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls, + FavBasedUserInterestedIn, + Model20m145k2020) + } + + private val followBasedUserInterestedIn20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildUserInterestedInStore( + UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls, + FollowBasedUserInterestedIn, + Model20m145k2020) + } + + private val logFavBasedUserInterestedIn20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildUserInterestedInStore( + UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls, + LogFavBasedUserInterestedIn, + Model20m145k2020) + } + + private val favBasedUserInterestedInFromPE20M145KUpdatedStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildUserInterestedInStore( + UserInterestedInReadableStore.defaultIIPESimClustersEmbeddingStoreWithMtls, + FavBasedUserInterestedInFromPE, + Model20m145kUpdated) + } + + private val twistlyUserInterestedInStore: ReadableStore[ + 
SimClustersEmbeddingId, + ThriftSimClustersEmbedding + ] = { + val interestedIn20M145KUpdatedStore = { + UserInterestedInReadableStore.defaultStoreWithMtls( + mhMtlsParams, + modelVersion = ModelVersions.Model20M145KUpdated + ) + } + val interestedIn20M145K2020Store = { + UserInterestedInReadableStore.defaultStoreWithMtls( + mhMtlsParams, + modelVersion = ModelVersions.Model20M145K2020 + ) + } + val interestedInFromPE20M145KUpdatedStore = { + UserInterestedInReadableStore.defaultIIPEStoreWithMtls( + mhMtlsParams, + modelVersion = ModelVersions.Model20M145KUpdated) + } + val simClustersInterestedInStore: ReadableStore[ + (UserId, ModelVersion), + ClustersUserIsInterestedIn + ] = { + new ReadableStore[(UserId, ModelVersion), ClustersUserIsInterestedIn] { + override def get(k: (UserId, ModelVersion)): Future[Option[ClustersUserIsInterestedIn]] = { + k match { + case (userId, Model20m145kUpdated) => + interestedIn20M145KUpdatedStore.get(userId) + case (userId, Model20m145k2020) => + interestedIn20M145K2020Store.get(userId) + case _ => + Future.None + } + } + } + } + val simClustersInterestedInFromProducerEmbeddingsStore: ReadableStore[ + (UserId, ModelVersion), + ClustersUserIsInterestedIn + ] = { + new ReadableStore[(UserId, ModelVersion), ClustersUserIsInterestedIn] { + override def get(k: (UserId, ModelVersion)): Future[Option[ClustersUserIsInterestedIn]] = { + k match { + case (userId, ModelVersion.Model20m145kUpdated) => + interestedInFromPE20M145KUpdatedStore.get(userId) + case _ => + Future.None + } + } + } + } + new twistly.interestedin.EmbeddingStore( + interestedInStore = simClustersInterestedInStore, + interestedInFromProducerEmbeddingStore = simClustersInterestedInFromProducerEmbeddingsStore, + statsReceiver = stats + ).mapValues(_.toThrift) + } + + private val userNextInterestedIn20m145k2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildUserInterestedInStore( + 
UserInterestedInReadableStore.defaultNextInterestedInStoreWithMtls, + UserNextInterestedIn, + Model20m145k2020) + } + + private val filteredUserInterestedIn20m145kUpdatedStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildMemCacheStore(twistlyUserInterestedInStore, FilteredUserInterestedIn, Model20m145kUpdated) + } + + private val filteredUserInterestedIn20m145k2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildMemCacheStore(twistlyUserInterestedInStore, FilteredUserInterestedIn, Model20m145k2020) + } + + private val filteredUserInterestedInFromPE20m145kUpdatedStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildMemCacheStore( + twistlyUserInterestedInStore, + FilteredUserInterestedInFromPE, + Model20m145kUpdated) + } + + private val unfilteredUserInterestedIn20m145kUpdatedStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildMemCacheStore( + twistlyUserInterestedInStore, + UnfilteredUserInterestedIn, + Model20m145kUpdated) + } + + private val unfilteredUserInterestedIn20m145k2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + buildMemCacheStore(twistlyUserInterestedInStore, UnfilteredUserInterestedIn, Model20m145k2020) + } + + // [Experimental] User InterestedIn, generated by aggregating IIAPE embedding from AddressBook + + private val logFavBasedInterestedMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val datasetName = "addressbook_sims_embedding_iiape_maxpooling" + val appId = "wtf_embedding_apollo" + buildUserInterestedInStoreGeneric( + simClustersEmbeddingStoreWithMtls, + LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE, + Model20m145k2020, + datasetName = datasetName, + appId = appId, + manhattanCluster = Apollo + ) + } + + private val logFavBasedInterestedAverageAddressBookFromIIAPE20M145K2020Store: 
ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val datasetName = "addressbook_sims_embedding_iiape_average" + val appId = "wtf_embedding_apollo" + buildUserInterestedInStoreGeneric( + simClustersEmbeddingStoreWithMtls, + LogFavBasedUserInterestedAverageAddressBookFromIIAPE, + Model20m145k2020, + datasetName = datasetName, + appId = appId, + manhattanCluster = Apollo + ) + } + + private val logFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val datasetName = "addressbook_sims_embedding_iiape_booktype_maxpooling" + val appId = "wtf_embedding_apollo" + buildUserInterestedInStoreGeneric( + simClustersEmbeddingStoreWithMtls, + LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE, + Model20m145k2020, + datasetName = datasetName, + appId = appId, + manhattanCluster = Apollo + ) + } + + private val logFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val datasetName = "addressbook_sims_embedding_iiape_largestdim_maxpooling" + val appId = "wtf_embedding_apollo" + buildUserInterestedInStoreGeneric( + simClustersEmbeddingStoreWithMtls, + LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE, + Model20m145k2020, + datasetName = datasetName, + appId = appId, + manhattanCluster = Apollo + ) + } + + private val logFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val datasetName = "addressbook_sims_embedding_iiape_louvain_maxpooling" + val appId = "wtf_embedding_apollo" + buildUserInterestedInStoreGeneric( + simClustersEmbeddingStoreWithMtls, + LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE, + Model20m145k2020, + datasetName = datasetName, + appId = appId, + manhattanCluster = Apollo + ) + } + + private val 
logFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val datasetName = "addressbook_sims_embedding_iiape_connected_maxpooling" + val appId = "wtf_embedding_apollo" + buildUserInterestedInStoreGeneric( + simClustersEmbeddingStoreWithMtls, + LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE, + Model20m145k2020, + datasetName = datasetName, + appId = appId, + manhattanCluster = Apollo + ) + } + + /** + * Helper func to build a readable store for some UserInterestedIn embeddings with + * 1. A storeFunc from UserInterestedInReadableStore + * 2. EmbeddingType + * 3. ModelVersion + * 4. MemCacheConfig + * */ + private def buildUserInterestedInStore( + storeFunc: (ManhattanKVClientMtlsParams, EmbeddingType, ModelVersion) => ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ], + embeddingType: EmbeddingType, + modelVersion: ModelVersion + ): ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val rawStore = storeFunc(mhMtlsParams, embeddingType, modelVersion) + .mapValues(_.toThrift) + val observedStore = ObservedReadableStore( + store = rawStore + )(stats.scope(embeddingType.name).scope(modelVersion.name)) + + MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding( + observedStore, + cacheClient, + embeddingType, + modelVersion, + stats + ) + } + + private def buildUserInterestedInStoreGeneric( + storeFunc: (ManhattanKVClientMtlsParams, EmbeddingType, ModelVersion, String, String, + ManhattanCluster) => ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ], + embeddingType: EmbeddingType, + modelVersion: ModelVersion, + datasetName: String, + appId: String, + manhattanCluster: ManhattanCluster + ): ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + val rawStore = + storeFunc(mhMtlsParams, embeddingType, modelVersion, datasetName, appId, manhattanCluster) + .mapValues(_.toThrift) + 
val observedStore = ObservedReadableStore( + store = rawStore + )(stats.scope(embeddingType.name).scope(modelVersion.name)) + + MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding( + observedStore, + cacheClient, + embeddingType, + modelVersion, + stats + ) + } + + private def simClustersEmbeddingStoreWithMtls( + mhMtlsParams: ManhattanKVClientMtlsParams, + embeddingType: EmbeddingType, + modelVersion: ModelVersion, + datasetName: String, + appId: String, + manhattanCluster: ManhattanCluster + ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { + + if (!modelVersionToDatasetMap.contains(ModelVersions.toKnownForModelVersion(modelVersion))) { + throw new IllegalArgumentException( + "Unknown model version: " + modelVersion + ". Known model versions: " + knownModelVersions) + } + getStore(appId, mhMtlsParams, datasetName, manhattanCluster) + .composeKeyMapping[SimClustersEmbeddingId] { + case SimClustersEmbeddingId(theEmbeddingType, theModelVersion, InternalId.UserId(userId)) + if theEmbeddingType == embeddingType && theModelVersion == modelVersion => + userId + }.mapValues(toSimClustersEmbedding(_, embeddingType)) + } + + private def buildMemCacheStore( + rawStore: ReadableStore[SimClustersEmbeddingId, ThriftSimClustersEmbedding], + embeddingType: EmbeddingType, + modelVersion: ModelVersion + ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { + val observedStore = ObservedReadableStore( + store = rawStore + )(stats.scope(embeddingType.name).scope(modelVersion.name)) + + MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding( + observedStore, + cacheClient, + embeddingType, + modelVersion, + stats + ) + } + + private val underlyingStores: Map[ + (EmbeddingType, ModelVersion), + ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] + ] = Map( + // KnownFor Embeddings + (FavBasedProducer, Model20m145kUpdated) -> favBasedProducer20M145KUpdatedEmbeddingStore, + (FavBasedProducer, Model20m145k2020) -> 
favBasedProducer20M145K2020EmbeddingStore, + (FollowBasedProducer, Model20m145k2020) -> followBasedProducer20M145K2020EmbeddingStore, + (AggregatableLogFavBasedProducer, Model20m145k2020) -> logFavBasedApe20M145K2020EmbeddingStore, + ( + RelaxedAggregatableLogFavBasedProducer, + Model20m145kUpdated) -> relaxedLogFavBasedApe20m145kUpdatedEmbeddingStore, + ( + RelaxedAggregatableLogFavBasedProducer, + Model20m145k2020) -> relaxedLogFavBasedApe20M145K2020EmbeddingStore, + // InterestedIn Embeddings + ( + LogFavBasedUserInterestedInFromAPE, + Model20m145k2020) -> logFavBasedInterestedInFromAPE20M145K2020Store, + ( + FollowBasedUserInterestedInFromAPE, + Model20m145k2020) -> followBasedInterestedInFromAPE20M145K2020Store, + (FavBasedUserInterestedIn, Model20m145kUpdated) -> favBasedUserInterestedIn20M145KUpdatedStore, + (FavBasedUserInterestedIn, Model20m145k2020) -> favBasedUserInterestedIn20M145K2020Store, + (FollowBasedUserInterestedIn, Model20m145k2020) -> followBasedUserInterestedIn20M145K2020Store, + (LogFavBasedUserInterestedIn, Model20m145k2020) -> logFavBasedUserInterestedIn20M145K2020Store, + ( + FavBasedUserInterestedInFromPE, + Model20m145kUpdated) -> favBasedUserInterestedInFromPE20M145KUpdatedStore, + (FilteredUserInterestedIn, Model20m145kUpdated) -> filteredUserInterestedIn20m145kUpdatedStore, + (FilteredUserInterestedIn, Model20m145k2020) -> filteredUserInterestedIn20m145k2020Store, + ( + FilteredUserInterestedInFromPE, + Model20m145kUpdated) -> filteredUserInterestedInFromPE20m145kUpdatedStore, + ( + UnfilteredUserInterestedIn, + Model20m145kUpdated) -> unfilteredUserInterestedIn20m145kUpdatedStore, + (UnfilteredUserInterestedIn, Model20m145k2020) -> unfilteredUserInterestedIn20m145k2020Store, + (UserNextInterestedIn, Model20m145k2020) -> userNextInterestedIn20m145k2020Store, + ( + LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE, + Model20m145k2020) -> logFavBasedInterestedMaxpoolingAddressBookFromIIAPE20M145K2020Store, + ( + 
LogFavBasedUserInterestedAverageAddressBookFromIIAPE, + Model20m145k2020) -> logFavBasedInterestedAverageAddressBookFromIIAPE20M145K2020Store, + ( + LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE, + Model20m145k2020) -> logFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE20M145K2020Store, + ( + LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE, + Model20m145k2020) -> logFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE20M145K2020Store, + ( + LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE, + Model20m145k2020) -> logFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE20M145K2020Store, + ( + LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE, + Model20m145k2020) -> logFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE20M145K2020Store, + ) + + val userSimClustersEmbeddingStore: ReadableStore[ + SimClustersEmbeddingId, + SimClustersEmbedding + ] = { + SimClustersEmbeddingStore.buildWithDecider( + underlyingStores = underlyingStores, + decider = rmsDecider.decider, + statsReceiver = stats + ) + } + +} diff --git a/representation-manager/server/src/main/thrift/BUILD b/representation-manager/server/src/main/thrift/BUILD new file mode 100644 index 000000000..f4edb5dcb --- /dev/null +++ b/representation-manager/server/src/main/thrift/BUILD @@ -0,0 +1,18 @@ +create_thrift_libraries( + base_name = "thrift", + sources = [ + "com/twitter/representation_manager/service.thrift", + ], + platform = "java8", + tags = [ + "bazel-compatible", + ], + dependency_roots = [ + "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift", + ], + generate_languages = [ + "java", + "scala", + "strato", + ], +) diff --git a/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.thrift b/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.thrift new file mode 100644 index 000000000..4eb36e999 --- /dev/null +++ 
b/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.thrift @@ -0,0 +1,14 @@ +namespace java com.twitter.representation_manager.thriftjava +#@namespace scala com.twitter.representation_manager.thriftscala +#@namespace strato com.twitter.representation_manager + +include "com/twitter/simclusters_v2/online_store.thrift" +include "com/twitter/simclusters_v2/identifier.thrift" + +/** + * A uniform column view for all kinds of SimClusters based embeddings. + **/ +struct SimClustersEmbeddingView { + 1: required identifier.EmbeddingType embeddingType + 2: required online_store.ModelVersion modelVersion +}(persisted = 'false', hasPersonalData = 'false')