Tldraw/apps/docs/utils/ContentVectorDatabase.ts

import { connect } from '@/scripts/functions/connect'
import { Article, ArticleHeading, ArticleHeadings } from '@/types/content-types'
import { config } from 'dotenv'
import OpenAI from 'openai'
import path from 'path'
import { LocalIndex } from 'vectra'
import { nicelog } from './nicelog'

// Run dotenv's config() first so that values read from process.env below
// (e.g. OPENAI_KEY) are populated before the OpenAI client is constructed.
config()

// Upper bound on how many articles getVectorDb() will index; Infinity means no cap.
const MAX_ARTICLES = Infinity
// Content toggles used by getVectorDb() when selecting articles to index:
// both true -> all articles; only INCLUDE_API_CONTENT -> articles whose
// sectionId is 'reference'; otherwise -> articles whose sectionId is not 'reference'.
const INCLUDE_API_CONTENT = true
const INCLUDE_CONTENT = true

// Shared vectra index, located at <cwd>/utils/vector-db.
const index = new LocalIndex(path.join(process.cwd(), 'utils', 'vector-db'))

// Shared OpenAI client; the API key comes from the OPENAI_KEY environment variable.
const openai = new OpenAI({
	apiKey: process.env.OPENAI_KEY,
})

/**
 * A small wrapper around a vectra index and an OpenAI client that stores
 * embeddings for docs articles (split into chunks) and their headings, and
 * lets callers query them by text.
 *
 * Item ids follow two conventions:
 * - article chunks: `${article.id}_${chunkIndex}` (contiguous, starting at 0)
 * - headings: `${article.id}#${heading.slug}`
 */
export class ContentVectorDatabase {
	/** The vector index that items are read from and written to. */
	index: LocalIndex
	/** The OpenAI client used to create embeddings. */
	api: OpenAI

	/**
	 * @param opts The index and OpenAI client to use. Both are required in practice;
	 * the `{}` default only exists to keep the historical signature.
	 */
	constructor(opts = {} as { index: LocalIndex; api: OpenAI }) {
		this.index = opts.index
		this.api = opts.api
	}

	/**
	 * Get embedding vectors for one or more pieces of text from OpenAI.
	 *
	 * @param inputs The texts to get vectors for.
	 *
	 * @returns One vector per input, in the order returned by the API. An empty
	 * input list yields an empty result without making a request.
	 */
	async getVectorEmbeddings(inputs: string[]): Promise<number[][]> {
		// Nothing to embed; avoid sending an empty input list to the API
		if (inputs.length === 0) return []

		const response = await this.api.embeddings.create({
			model: 'text-embedding-ada-002',
			input: inputs,
		})
		return response.data.map((item) => item.embedding)
	}

	/**
	 * Add (or refresh) a single heading of an article in the index.
	 *
	 * The heading is skipped when an item with the same id and hash is already
	 * present; otherwise any existing item is replaced.
	 *
	 * @param article The article that the heading belongs to.
	 * @param heading The heading to add.
	 */
	async addHeadingToIndex(article: Article, heading: ArticleHeading): Promise<void> {
		const id = `${article.id}#${heading.slug}`

		// Skip headings that are already present.
		// NOTE(review): the hash covers the heading title and slug only, while the
		// embedded text below also includes the article title, so renaming an article
		// does not refresh its headings. Left as-is to keep stored hashes valid.
		const hash = this.getHashForString(heading.title + heading.slug)
		const existingItem = await this.index.getItem(id)
		if (existingItem) {
			if (existingItem.metadata.hash === hash) {
				nicelog(`Skipping heading ${id} (already present)`)
				return
			}
			await this.index.deleteItem(id)
		}

		nicelog(`Adding headers for ${article.title}#${heading.title}`)
		const vectors = await this.getVectorEmbeddings([article.title + '#' + heading.title])

		// Await the insert so the write has finished (and any failure surfaces here)
		// before the caller starts its next index operation.
		await this.index.insertItem({
			id,
			vector: vectors[0],
			metadata: { type: 'heading', articleId: article.id, slug: heading.slug, hash },
		})
	}

	/**
	 * Add (or refresh) an article and its headings in the index.
	 *
	 * The article text is split into fixed-size chunks, each stored as its own
	 * item. If the first chunk already exists with a matching content hash the
	 * article is skipped; otherwise all previously stored chunks are removed
	 * and the new ones are inserted.
	 *
	 * @param article The article to add to the index.
	 * @param headings The headings of the article to add alongside it.
	 */
	async addArticleToIndex(article: Article, headings: ArticleHeadings): Promise<void> {
		// This is the content that we'll create the embedding for
		let contentToVectorize: string

		if (article.sectionId === 'reference') {
			// For API docs, we'll just use the title, description, and members as the content.
			// We'll also add a note that the content was generated from the API docs, hopefully
			// so that the embedding better reflects searches for api docs.
			contentToVectorize = `Title: ${article.title}\nPackage: @tldraw/${article.categoryId}\nDescription: ${article.description}\nMembers:${article.keywords}\n\n(content generated from API docs)`
		} else {
			// The content is the raw markdown content, which includes all the markdown
			// headings and annotations, though none of the frontmatter. We'll add the
			// frontmatter information again manually. We may need to also add some information
			// about how "important" this article is, relative to related docs or headings.
			contentToVectorize = `Title: ${article.title}\nDescription: ${article.description}\nKeywords:${article.keywords}\nMarkdown:\n${article.content}`
		}

		// Headings are indexed one at a time so index writes never overlap
		for (const heading of headings) {
			await this.addHeadingToIndex(article, heading)
		}

		// Generate a hash based on the content that we'd be vectorizing
		const hash = this.getHashForString(contentToVectorize)

		// Split the content into chunks of 500 characters (not tokens) so that each
		// embedding input stays small.
		const chunksToAdd: string[] = []
		const chunkSize = 500
		for (let i = 0; i < contentToVectorize.length; i += chunkSize) {
			chunksToAdd.push(contentToVectorize.slice(i, i + chunkSize))
		}

		// Is there already an item with this id?
		const existingItem = await this.index.getItem(article.id + '_0')

		if (existingItem) {
			// ...and if the existing item matches our hash, we can skip it
			if (existingItem.metadata.hash === hash) {
				nicelog(`Skipping ${article.id} (already present)`)
				return
			}

			// ...otherwise, delete every previously stored chunk. The old version may
			// have had more chunks than the new one, so we can't rely on the new count.
			await this.deleteArticleChunks(article.id)
		}

		// Add chunks to index
		nicelog(`Adding article ${article.title} (${chunksToAdd.length} chunks)`)

		// Get an embedding / vector for all of the chunks
		const vectors = await this.getVectorEmbeddings(chunksToAdd)

		for (let i = 0; i < vectors.length; i++) {
			// Add the article chunk to the index (include the hash as metadata)
			await this.index.insertItem({
				id: article.id + '_' + i,
				vector: vectors[i],
				metadata: { type: 'article', articleId: article.id, hash },
			})
		}

		// Sleep briefly (35ms) to avoid rate limiting
		await new Promise((r) => setTimeout(r, 35))
	}

	/**
	 * Delete all stored chunks of an article. Chunk ids are contiguous
	 * (`_0`, `_1`, ...), so this walks forward until an id is missing.
	 */
	private async deleteArticleChunks(articleId: string): Promise<void> {
		for (let i = 0; ; i++) {
			const chunkId = articleId + '_' + i
			if (!(await this.index.getItem(chunkId))) break
			await this.index.deleteItem(chunkId)
		}
	}

	/**
	 * Query the index for the items closest to a piece of text.
	 *
	 * @param text The text to query.
	 * @param limit The maximum number of index items to fetch. Because several
	 * chunks of the same article are collapsed into one result, fewer results
	 * than `limit` may be returned.
	 *
	 * @returns The matching articles and headings, in the order returned by the index.
	 */
	async query(
		text: string,
		limit = 5
	): Promise<
		(
			| { id: string; type: 'article'; score: number }
			| { id: string; type: 'heading'; slug: string; score: number }
		)[]
	> {
		const vector = await this.getVectorEmbeddings([text])
		const results = await this.index.queryItems(vector[0], limit)
		const output: (
			| { id: string; type: 'article'; score: number }
			| { id: string; type: 'heading'; slug: string; score: number }
		)[] = []
		// Article ids that have already been emitted (only the first chunk hit is kept)
		const visited = new Set<string>()
		for (const result of results) {
			const id = result.item.metadata.articleId as string
			const type = result.item.metadata.type as 'article' | 'heading'
			if (type === 'heading') {
				const slug = result.item.metadata.slug as string
				output.push({ id, type, slug, score: result.score })
			} else {
				// multiple chunks may have been returned
				if (visited.has(id)) continue
				output.push({ id, type, score: result.score })
				visited.add(id)
			}
		}
		return output
	}

	/**
	 * Hash a string with a 31-multiplier rolling hash over its UTF-16 code units,
	 * truncated to a signed 32-bit integer. (This is not FNV-1a; the algorithm is
	 * kept unchanged so hashes already stored in the index remain comparable.)
	 *
	 * @param string The string to hash.
	 *
	 * @returns The hash as a decimal string.
	 *
	 * @public
	 */
	getHashForString(string: string): string {
		let hash = 0
		for (let i = 0; i < string.length; i++) {
			// (hash << 5) - hash is hash * 31
			hash = (hash << 5) - hash + string.charCodeAt(i)
			hash |= 0 // Convert to 32bit integer
		}
		return hash + ''
	}
}

// Module-level cache holding the database once getVectorDb() has created it.
let _cvdb: ContentVectorDatabase

/**
 * Get the shared ContentVectorDatabase, creating it on the first call.
 *
 * Once an instance exists it is returned as-is and `opts` is ignored.
 *
 * @param opts.updateContent When true, articles and headings from the content
 * database are (re)added to the index.
 * @param opts.rebuildIndex When true, the index is recreated (deleting any
 * existing one) and then populated as with `updateContent`.
 *
 * @returns The shared database instance.
 */
export async function getVectorDb(
	opts = {} as {
		updateContent?: boolean
		rebuildIndex?: boolean
	}
) {
	if (_cvdb) return _cvdb

	const { updateContent, rebuildIndex } = opts

	// Create the index when asked to rebuild, or when none exists yet
	const shouldCreateIndex = rebuildIndex || !(await index.isIndexCreated())
	if (shouldCreateIndex) {
		await index.createIndex({ deleteIfExists: rebuildIndex, version: 1 })
	}

	_cvdb = new ContentVectorDatabase({ api: openai, index })

	// No content refresh requested: the instance is ready as it is
	if (!updateContent && !rebuildIndex) return _cvdb

	nicelog(`Rebuilding index`)
	const db = await connect({ reset: false, mode: 'readonly' })

	nicelog(`Getting articles`)
	let articles
	if (INCLUDE_API_CONTENT && INCLUDE_CONTENT) {
		// Everything
		articles = await db.all('SELECT * FROM articles')
	} else if (INCLUDE_API_CONTENT) {
		// API reference only
		articles = await db.all('SELECT * FROM articles WHERE articles.sectionId = ?', 'reference')
	} else {
		// Everything except the API reference
		articles = await db.all('SELECT * FROM articles WHERE articles.sectionId != ?', 'reference')
	}

	nicelog(`Adding articles to index`)
	const count = Math.min(articles.length, MAX_ARTICLES)
	for (let n = 0; n < count; n++) {
		const current = articles[n]

		// Fetch the article's headings, leaving out the listed generic slugs
		const currentHeadings = await db.all(
			'SELECT * FROM headings WHERE articleId = ? AND slug NOT IN (?, ?, ?, ?)',
			current.id,
			'constructor',
			'properties',
			'example',
			'methods'
		)

		nicelog(`Adding article ${current.id} to index (${n} of ${count})`)
		await _cvdb.addArticleToIndex(current, currentHeadings)
	}

	return _cvdb
}
Add docs (#2470) This PR adds the docs app back into the tldraw monorepo. ## Deploying We'll want to update our deploy script to update the SOURCE_SHA to the newest release sha... and then deploy the docs pulling api.json files from that release. We _could_ update the docs on every push to main, but we don't have to unless something has changed. Right now there's no automated deployments from this repo. ## Side effects To make this one work, I needed to update the lock file. This might be ok (new year new lock file), and everything builds as expected, though we may want to spend some time with our scripts to be sure that things are all good. I also updated our prettier installation, which decided to add trailing commas to every generic type. Which is, I suppose, [correct behavior](https://github.com/prettier/prettier-vscode/issues/955)? But that caused diffs in every file, which is unfortunate. ### Change Type - [x] `internal` — Any other changes that don't affect the published package[^2] 2024-01-15 12:33:15 +00:00			`import { connect } from '@/scripts/functions/connect'`
			`import { Article, ArticleHeading, ArticleHeadings } from '@/types/content-types'`
			`import { config } from 'dotenv'`
			`import OpenAI from 'openai'`
			`import path from 'path'`
			`import { LocalIndex } from 'vectra'`
			`import { nicelog } from './nicelog'`

			`config()`

			`const MAX_ARTICLES = Infinity`
			`const INCLUDE_API_CONTENT = true`
			`const INCLUDE_CONTENT = true`

			`const index = new LocalIndex(path.join(process.cwd(), 'utils', 'vector-db'))`

			`const openai = new OpenAI({`
			`apiKey: process.env.OPENAI_KEY,`
			`})`

			`export class ContentVectorDatabase {`
			`index: LocalIndex`
			`api: OpenAI`

			`constructor(opts = {} as { index: LocalIndex; api: OpenAI }) {`
			`this.index = opts.index`
			`this.api = opts.api`
			`}`

			`/**`
			`* Get a vector from a piece of text from openai.`
			`*`
			`* @param text The text to get a vector for.`
			`*`
			`* @returns The vector.`
			`*/`
			`async getVectorEmbeddings(inputs: string[]) {`
			`const response = await this.api.embeddings.create({`
			`model: 'text-embedding-ada-002',`
			`input: inputs,`
			`})`
			`return response.data.map((d) => d.embedding)`
			`}`

			`async addHeadingToIndex(article: Article, heading: ArticleHeading) {`
			const id = `${article.id}#${heading.slug}`

			`// Skip headings that are already present`
			`const hash = this.getHashForString(heading.title + heading.slug)`
			`const existingItem = await this.index.getItem(id)`
			`if (existingItem) {`
			`if (existingItem.metadata.hash === hash) {`
			nicelog(`Skipping heading ${id} (already present)`)
			`return`
			`}`
			`await this.index.deleteItem(id)`
			`}`

			nicelog(`Adding headers for ${article.title}#${heading.title}`)
			`const vectors = await this.getVectorEmbeddings([article.title + '#' + heading.title])`
			`this.index.insertItem({`
			`id,`
			`vector: vectors[0],`
			`metadata: { type: 'heading', articleId: article.id, slug: heading.slug, hash },`
			`})`
			`}`

			`/**`
			`* Add a text item to the index.`
			`*`
			`* @param text The text to add to the index.`
			`*`
			`* @returns The index item.`
			`*/`
			`async addArticleToIndex(article: Article, headings: ArticleHeadings) {`
			`// This is the content that we'll create the embedding for`
			`let contentToVectorize: string`

docs: rework docs site to have different sections (#2686) This PR starts putting in place the high-level changes we want to make to the docs site. - It makes separate sections for Reference and Examples and Community. - Gets rid of the secondary sidebar and integrates it into the main sidebar. - Groups the reference articles by type. - Pulls in the examples alongside code and a live playground so people don't have to visit examples.tldraw.com separately. <img width="1458" alt="Screenshot 2024-01-30 at 09 43 46" src="https://github.com/tldraw/tldraw/assets/469604/4f5aa339-3a69-4d9b-9b9f-dfdddea623e8"> Again, this is the top-level changes and there's more to be done for the next PR(s): - create quick start page - clean up installation page - add accordion to Examples page prbly - put fun stuff in header (from footer) - landing page - something for landing page of API - search cmd-k and border - cleanup _sidebarReferenceContentLinks - external links _blank - address potential skew issue with code examples - have a link to other examples (next.js, etc.) ### Change Type - [x] `documentation` — Changes to the documentation only[^2] ### Test Plan 1. Make sure examples work! ### Release Notes - Rework our docs site to pull together the examples app and reference section more cohesively. --------- Co-authored-by: Taha <98838967+Taha-Hassan-Git@users.noreply.github.com> Co-authored-by: Steve Ruiz <steveruizok@gmail.com> Co-authored-by: Mitja Bezenšek <mitja.bezensek@gmail.com> Co-authored-by: alex <alex@dytry.ch> Co-authored-by: Lu Wilson <l2wilson94@gmail.com> Co-authored-by: Dan Groshev <git@dgroshev.com> 2024-01-30 14:19:25 +00:00			`if (article.sectionId === 'reference') {`
Add docs (#2470) This PR adds the docs app back into the tldraw monorepo. ## Deploying We'll want to update our deploy script to update the SOURCE_SHA to the newest release sha... and then deploy the docs pulling api.json files from that release. We _could_ update the docs on every push to main, but we don't have to unless something has changed. Right now there's no automated deployments from this repo. ## Side effects To make this one work, I needed to update the lock file. This might be ok (new year new lock file), and everything builds as expected, though we may want to spend some time with our scripts to be sure that things are all good. I also updated our prettier installation, which decided to add trailing commas to every generic type. Which is, I suppose, [correct behavior](https://github.com/prettier/prettier-vscode/issues/955)? But that caused diffs in every file, which is unfortunate. ### Change Type - [x] `internal` — Any other changes that don't affect the published package[^2] 2024-01-15 12:33:15 +00:00			`// For API docs, we'll just use the title, description, and members as the content.`
			`// We'll also add a note that the content was generated from the API docs, hopefully`
			`// so that the embedding better reflects searches for api docs.`
			contentToVectorize = `Title: ${article.title}\nPackage: @tldraw/${article.categoryId}\nDescription: ${article.description}\nMembers:${article.keywords}\n\n(content generated from API docs)`
			`} else {`
			`// The content is the raw markdown content, which includes all the markdown`
			`// headings and annotations, though none of the frontmatter. We'll add the`
			`// frontmatter information again manually. We may need to also add some information`
			`// about how "important" this article is, relative to related docs or headings.`
			contentToVectorize = `Title: ${article.title}\nDescription: ${article.description}\nKeywords:${article.keywords}\nMarkdown:\n${article.content}`
			`}`

			`if (headings.length) {`
			`for (const heading of headings) {`
			`await this.addHeadingToIndex(article, heading)`
			`}`
			`}`

			`// Generate a hash based on the content that we'd be vectorizing`
			`const hash = this.getHashForString(contentToVectorize)`

			`// Create chunks from the content; openAI has a limit of 500 tokens per request`
			`const chunksToAdd: string[] = []`
			`const chunkSize = 500`
			`for (let i = 0; i < contentToVectorize.length; i += chunkSize) {`
			`const chunk = contentToVectorize.slice(i, i + chunkSize)`
			`chunksToAdd.push(chunk)`
			`}`

			`// Is there already an item with this id?`
			`const existingItem = await this.index.getItem(article.id + '_0')`

			`if (existingItem) {`
			`// ...and if the existing item matches our hash, we can skip it`
			`if (existingItem.metadata.hash === hash) {`
			nicelog(`Skipping ${article.id} (already present)`)
			`return`
			`}`

			`// ...otherwise, delete all the chunks so that we can add a new one.`
			`for (let i = 0; i < chunksToAdd.length; i++) {`
			`await this.index.deleteItem(article.id + '_' + i)`
			`}`
			`}`

			`// Add chunks to index`
			nicelog(`Adding article ${article.title} (${chunksToAdd.length} chunks)`)

			`// Get an embedding / vector for all of the chunks`
			`const vectors = await this.getVectorEmbeddings(chunksToAdd)`

			`for (let i = 0; i < vectors.length; i++) {`
			`const vector = vectors[i]`
			`// Add the article item to the index (include the hash as metadata)`
			`await this.index.insertItem({`
			`id: article.id + '_' + i,`
			`vector: vector,`
			`metadata: { type: 'article', articleId: article.id, hash },`
			`})`
			`}`

			`// Sleep for 50ms or so to avoid rate limiting`
			`await new Promise((r) => setTimeout(r, 35))`

			`return`
			`}`

			`/**`
			`* Query an item using our index.`
			`*`
			`* @param text The text to query.`
			`*`
			`* @returns The query results.`
			`*/`
			`async query(text: string, limit = 5) {`
			`const vector = await this.getVectorEmbeddings([text])`
			`const results = await this.index.queryItems(vector[0], limit)`
			`const output: (`
			`\| { id: string; type: 'article'; score: number }`
			`\| { id: string; type: 'heading'; slug: string; score: number }`
			`)[] = []`
			`const visited = new Set<string>()`
			`for (const result of results) {`
			`const id = result.item.metadata.articleId as string`
			`const type = result.item.metadata.type as 'article' \| 'heading'`
			`if (type === 'heading') {`
			`const slug = result.item.metadata.slug as string`
			`output.push({ id, type, slug, score: result.score })`
			`} else {`
			`// multiple chunks may have been returned`
			`if (visited.has(id)) continue`
			`output.push({ id, type, score: result.score })`
			`visited.add(id)`
			`}`
			`}`
			`return output`
			`}`

			`/**`
			`* Hash a string using the FNV-1a algorithm.`
			`*`
			`* @public`
			`*/`
			`getHashForString(string: string) {`
			`let hash = 0`
			`for (let i = 0; i < string.length; i++) {`
			`hash = (hash << 5) - hash + string.charCodeAt(i)`
			`hash \|= 0 // Convert to 32bit integer`
			`}`
			`return hash + ''`
			`}`
			`}`

			`let _cvdb: ContentVectorDatabase`

			`export async function getVectorDb(`
			`opts = {} as {`
			`updateContent?: boolean`
			`rebuildIndex?: boolean`
			`}`
			`) {`
			`if (_cvdb) {`
			`return _cvdb`
			`}`

			`if (opts.rebuildIndex \|\| !(await index.isIndexCreated())) {`
			`await index.createIndex({ deleteIfExists: opts.rebuildIndex, version: 1 })`
			`}`

			`_cvdb = new ContentVectorDatabase({ api: openai, index })`

			`if (opts.updateContent \|\| opts.rebuildIndex) {`
			nicelog(`Rebuilding index`)
Don't check api.json files into git (#3565) These were needed when the docs lived in a different repo, but they don't any more so we can get rid of them. ### Change Type - [x] `internal` — Does not affect user-facing stuff - [x] `chore` — Updating dependencies, other boring stuff 2024-04-24 15:58:26 +00:00			`const db = await connect({ reset: false, mode: 'readonly' })`
Add docs (#2470) This PR adds the docs app back into the tldraw monorepo. ## Deploying We'll want to update our deploy script to update the SOURCE_SHA to the newest release sha... and then deploy the docs pulling api.json files from that release. We _could_ update the docs on every push to main, but we don't have to unless something has changed. Right now there's no automated deployments from this repo. ## Side effects To make this one work, I needed to update the lock file. This might be ok (new year new lock file), and everything builds as expected, though we may want to spend some time with our scripts to be sure that things are all good. I also updated our prettier installation, which decided to add trailing commas to every generic type. Which is, I suppose, [correct behavior](https://github.com/prettier/prettier-vscode/issues/955)? But that caused diffs in every file, which is unfortunate. ### Change Type - [x] `internal` — Any other changes that don't affect the published package[^2] 2024-01-15 12:33:15 +00:00
			nicelog(`Getting articles`)
			`const articles =`
			`INCLUDE_API_CONTENT && INCLUDE_CONTENT`
			`? await db.all('SELECT * FROM articles')`
			`: INCLUDE_API_CONTENT`
Unbiome (#2776) Biome as it is now didn't work out for us 😢 Summary for posterity: * it IS much, much faster, fast enough to skip any sort of caching * we couldn't fully replace Prettier just yet. We use Prettier programmatically to format code in docs, and Biome's JS interface is officially alpha and [had legacy peer deps set](https://github.com/biomejs/biome/pull/1756) (which would fail our CI build as we don't allow installation warnings) * ternary formatting differs from Prettier, leading to a large diff https://github.com/biomejs/biome/issues/1661 * import sorting differs from Prettier's `prettier-plugin-organize-imports`, making the diff even bigger * the deal breaker is a multi-second delay on saving large files (for us it's [Editor.ts](https://github.com/tldraw/tldraw/blob/main/packages/editor/src/lib/editor/Editor.ts)) in VSCode when import sorting is enabled. There is a seemingly relevant Biome issue where I posted a small summary of our findings: https://github.com/biomejs/biome/issues/1569#issuecomment-1930411623 Further actions: * reevaluate in a few months as Biome matures ### Change Type - [x] `internal` — Any other changes that don't affect the published package 2024-02-07 16:02:22 +00:00			`? await db.all('SELECT * FROM articles WHERE articles.sectionId = ?', 'reference')`
			`: await db.all('SELECT * FROM articles WHERE articles.sectionId != ?', 'reference')`
Add docs (#2470) This PR adds the docs app back into the tldraw monorepo. ## Deploying We'll want to update our deploy script to update the SOURCE_SHA to the newest release sha... and then deploy the docs pulling api.json files from that release. We _could_ update the docs on every push to main, but we don't have to unless something has changed. Right now there's no automated deployments from this repo. ## Side effects To make this one work, I needed to update the lock file. This might be ok (new year new lock file), and everything builds as expected, though we may want to spend some time with our scripts to be sure that things are all good. I also updated our prettier installation, which decided to add trailing commas to every generic type. Which is, I suppose, [correct behavior](https://github.com/prettier/prettier-vscode/issues/955)? But that caused diffs in every file, which is unfortunate. ### Change Type - [x] `internal` — Any other changes that don't affect the published package[^2] 2024-01-15 12:33:15 +00:00
			nicelog(`Adding articles to index`)
			`const max = Math.min(articles.length, MAX_ARTICLES)`
			`for (let i = 0; i < max; i++) {`
			`const article = articles[i]`
			`const headings = await db.all(`
			`'SELECT * FROM headings WHERE articleId = ? AND slug NOT IN (?, ?, ?, ?)',`
			`article.id,`
			`'constructor',`
			`'properties',`
			`'example',`
			`'methods'`
			`)`
			nicelog(`Adding article ${article.id} to index (${i} of ${max})`)
			`await _cvdb.addArticleToIndex(article, headings)`
			`}`
			`}`

			`return _cvdb`
			`}`