feat: improve JSON extraction

old-agentic-v1^2
Philipp Burckhardt 2023-06-21 14:33:15 -04:00
rodzic 8682fa71c8
commit 9a604891b8
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: A2C3BCA4F31D1DDD
6 zmienionych plików z 189 dodań i 170 usunięć

Wyświetl plik

@ -1,11 +1,87 @@
import { JSONRepairError, jsonrepair } from 'jsonrepair'
import { JsonValue } from 'type-fest'
import { ZodType, z } from 'zod'
import * as errors from '@/errors'
import {
extractJSONArrayFromString,
extractJSONObjectFromString
} from '@/utils'
/**
 * Checks if character at the specified index in a string is escaped.
 *
 * A character counts as escaped when it is preceded by an odd number of
 * consecutive backslashes. An even-length run is made up entirely of escaped
 * backslashes (`\\`), which leaves the character itself unescaped.
 *
 * @param str - string to check
 * @param i - index of the character to check
 * @returns whether the character is escaped
 */
function isEscaped(str: string, i: number): boolean {
  let backslashCount = 0
  // Walk left over the whole run of backslashes; looking at only the two
  // previous characters misclassifies runs of three or more (e.g. `\\\"`).
  for (let j = i - 1; j >= 0 && str[j] === '\\'; j--) {
    backslashCount += 1
  }
  return backslashCount % 2 === 1
}
/**
 * Extracts JSON objects or arrays from a string.
 *
 * The input is scanned once. Nesting depth is tracked only for the bracket
 * pair of the requested structure type (`{`/`}` for objects, `[`/`]` for
 * arrays). Each time the depth returns to zero, the spanned substring is
 * passed through `jsonrepair` and `JSON.parse`; the result is kept when it is
 * a non-null object or array.
 *
 * Quote characters are only interpreted while inside a structure. In the
 * surrounding text they are ordinary characters, so an apostrophe in prose
 * (e.g. `Here's the result: {...}`) does not hide the JSON that follows it.
 *
 * @param input - string to extract JSON from
 * @param jsonStructureType - type of JSON structure to extract
 * @returns array of extracted JSON objects or arrays (empty if none are found)
 * @throws Error if a closing bracket appears without a matching opening
 * bracket, or if a structure is still open at the end of the input. Errors
 * raised by `jsonrepair` or `JSON.parse` are not caught here.
 */
export function extractJSONFromString(
  input: string,
  jsonStructureType: 'object' | 'array'
): JsonValue[] {
  const startChar = jsonStructureType === 'object' ? '{' : '['
  const endChar = jsonStructureType === 'object' ? '}' : ']'
  const extractedJSONValues: JsonValue[] = []
  let nestingLevel = 0
  let startIndex = 0
  const isInsideQuoted = { '"': false, "'": false }

  for (let i = 0; i < input.length; i++) {
    const ch = input.charAt(i)
    switch (ch) {
      case '"':
      case "'":
        // A quote toggles string state only when we are inside a structure,
        // it is not nested in a string of the other quote kind, and it is not
        // backslash-escaped.
        if (
          nestingLevel > 0 &&
          !isInsideQuoted[ch === '"' ? "'" : '"'] &&
          !isEscaped(input, i)
        ) {
          isInsideQuoted[ch] = !isInsideQuoted[ch]
        }
        break
      default:
        // Brackets inside a quoted string are string content, not structure.
        if (isInsideQuoted['"'] || isInsideQuoted["'"]) {
          break
        }
        if (ch === startChar) {
          if (nestingLevel === 0) {
            // Remember where the outermost structure begins.
            startIndex = i
          }
          nestingLevel += 1
        } else if (ch === endChar) {
          nestingLevel -= 1
          if (nestingLevel === 0) {
            // Outermost structure just closed: repair and parse its text.
            const candidate = input.slice(startIndex, i + 1)
            const parsed = JSON.parse(jsonrepair(candidate))
            if (parsed && typeof parsed === 'object') {
              extractedJSONValues.push(parsed)
            }
          } else if (nestingLevel < 0) {
            throw new Error(
              `Invalid JSON string: unexpected ${endChar} at position ${i}`
            )
          }
        }
    }
  }

  if (nestingLevel !== 0) {
    throw new Error(
      'Invalid JSON string: unmatched ' + startChar + ' or ' + endChar
    )
  }
  return extractedJSONValues
}
const BOOLEAN_OUTPUTS = {
true: true,
@ -28,8 +104,12 @@ const BOOLEAN_OUTPUTS = {
*/
export function parseArrayOutput(output: string): Array<any> {
try {
const trimmedOutput = extractJSONArrayFromString(output)
const parsedOutput = JSON.parse(jsonrepair(trimmedOutput ?? output))
const arr = extractJSONFromString(output, 'array')
if (arr.length === 0) {
throw new errors.OutputValidationError(`Invalid JSON array: ${output}`)
}
const parsedOutput = arr[0]
if (!Array.isArray(parsedOutput)) {
throw new errors.OutputValidationError(
`Invalid JSON array: ${JSON.stringify(parsedOutput)}`
@ -59,19 +139,22 @@ export function parseArrayOutput(output: string): Array<any> {
*/
export function parseObjectOutput(output: string) {
try {
const trimmedOutput = extractJSONObjectFromString(output)
output = JSON.parse(jsonrepair(trimmedOutput ?? output))
const arr = extractJSONFromString(output, 'object')
if (arr.length === 0) {
throw new errors.OutputValidationError(`Invalid JSON object: ${output}`)
}
if (Array.isArray(output)) {
let parsedOutput = arr[0]
if (Array.isArray(parsedOutput)) {
// TODO
output = output[0]
} else if (typeof output !== 'object') {
parsedOutput = parsedOutput[0]
} else if (typeof parsedOutput !== 'object') {
throw new errors.OutputValidationError(
`Invalid JSON object: ${JSON.stringify(output)}`
`Invalid JSON object: ${JSON.stringify(parsedOutput)}`
)
}
return output
return parsedOutput
} catch (err: any) {
if (err instanceof JSONRepairError) {
throw new errors.OutputValidationError(err.message, { cause: err })
@ -149,6 +232,7 @@ export function parseOutput(output: string, outputSchema: ZodType<any>) {
} else if (outputSchema instanceof z.ZodNumber) {
result = parseNumberOutput(output, outputSchema)
} else {
// Default to string output...
result = output
}

Wyświetl plik

@ -4,26 +4,6 @@ import { JsonValue } from 'type-fest'
import * as types from './types'
/**
 * Extracts a JSON object string from a given string.
 *
 * The returned substring runs from the first `{` to the last `}` in the text,
 * whatever characters lie in between (including `\r\n` line endings).
 *
 * FIXME: This breaks if there are multiple JSON objects in the string, because
 * everything between the first object's `{` and the last object's `}` is
 * returned as a single span.
 *
 * @param text - string from which to extract the JSON object
 * @returns extracted JSON object string, or `undefined` if no JSON object is found
 */
export function extractJSONObjectFromString(text: string): string | undefined {
  const start = text.indexOf('{')
  const end = text.lastIndexOf('}')
  // A closing brace is only usable when it comes after the opening one.
  return start !== -1 && end > start ? text.slice(start, end + 1) : undefined
}
/**
 * Extracts a JSON array string from a given string.
 *
 * The returned substring runs from the first `[` to the last `]` in the text,
 * whatever characters lie in between (including `\r\n` line endings).
 *
 * FIXME: This breaks if there are multiple JSON arrays in the string, because
 * everything between the first array's `[` and the last array's `]` is
 * returned as a single span.
 *
 * @param text - string from which to extract the JSON array
 * @returns extracted JSON array string, or `undefined` if no JSON array is found
 */
export function extractJSONArrayFromString(text: string): string | undefined {
  const start = text.indexOf('[')
  const end = text.lastIndexOf(']')
  // A closing bracket is only usable when it comes after the opening one.
  return start !== -1 && end > start ? text.slice(start, end + 1) : undefined
}
/**
* Pauses the execution of a function for a specified time.
*

Wyświetl plik

@ -62,38 +62,11 @@ Generated by [AVA](https://avajs.dev).
},
]
## parseArrayOutput - handles and repairs broken JSON arrays correctly
> should repair and return [1, "two", 3] for [1, "two, 3]
[
1,
'two, 3]',
]
> should repair and return ["a", "b", "c"] for Array: ["a, "b", "c"]. Error here!
[
'a, ',
'b',
', ',
'c',
']',
]
> should repair and return {"arr": ["value1", "value2"]} for Array in text {"arr": ["value1, "value2"]}
[
'value1, ',
'value2',
']',
]
## parseArrayOutput - throws error for invalid arrays
> Snapshot 1
'Invalid JSON array: "not a valid array"'
'Invalid JSON array: not a valid array'
## parseObjectOutput - handles valid objects correctly
@ -131,25 +104,6 @@ Generated by [AVA](https://avajs.dev).
name: 'John',
}
## parseObjectOutput - handles and repairs broken JSON objects correctly
> should repair and return {"a":1, "b":2, "c":3} for {"a":1, "b":2, "c":3
{
a: 1,
b: 2,
c: 3,
}
> should repair and return {"name":"John","age":30,"city":"New York"} for Object: {"name":"John,"age":30,"city":"New York"}. Error here!
{
'New York': '}',
age: ':30,',
city: ':',
name: 'John,',
}
## parseObjectOutput - handles JSON array of objects
> should return first object {"a":1,"b":2} for [{"a":1,"b":2},{"c":3,"d":4}]
@ -163,7 +117,7 @@ Generated by [AVA](https://avajs.dev).
> Snapshot 1
'Invalid JSON object: "not a valid object"'
'Invalid JSON object: not a valid object'
## parseBooleanOutput - handles `true` outputs correctly
@ -261,34 +215,3 @@ Generated by [AVA](https://avajs.dev).
> Snapshot 1
'Invalid number output: not a number'
## parseOutput - handles array correctly
> should parse and return [1, 2, 3] for "[1, 2, 3]"
[
1,
2,
3,
]
## parseOutput - handles object correctly
> should parse and return {"a": 1, "b": "two"} for "{"a": 1, "b": "two"}"
{
a: 1,
b: 'two',
}
## parseOutput - handles boolean correctly
> should parse and return true for "True"
true
## parseOutput - handles number correctly
> should parse and return 123.45 for "123.45"
123.45

Wyświetl plik

@ -2,6 +2,7 @@ import test from 'ava'
import { z } from 'zod'
import {
extractJSONFromString,
parseArrayOutput,
parseBooleanOutput,
parseNumberOutput,
@ -9,6 +10,95 @@ import {
parseOutput
} from '@/llms/parse-output'
test('extractJSONFromString should extract JSON object from string', (t) => {
  // Flat object surrounded by prose.
  const flat = extractJSONFromString(
    'Some text {"name":"John Doe"} more text',
    'object'
  )
  t.deepEqual(flat[0], { name: 'John Doe' })

  // Object that contains another object.
  const nested = extractJSONFromString(
    'Some text {"name":"John Doe","age":42,"address":{"street":"Main Street","number":42}} more text',
    'object'
  )
  t.deepEqual(nested[0], {
    name: 'John Doe',
    age: 42,
    address: { street: 'Main Street', number: 42 }
  })

  // Apostrophe inside a double-quoted string value.
  const withApostrophe = extractJSONFromString(
    'foo {"name":"John Doe","school":"St. John\'s"} bar',
    'object'
  )
  t.deepEqual(withApostrophe[0], { name: 'John Doe', school: "St. John's" })
})
test('extractJSONFromString should extract an invalid JSON object from string', (t) => {
  // Single-quoted string value, which is not valid JSON as written.
  const singleQuoted = extractJSONFromString(
    'Some text {"name":\'John Doe\'} more text',
    'object'
  )
  t.deepEqual(singleQuoted[0], { name: 'John Doe' })

  // Trailing comma after the last property.
  const trailingComma = extractJSONFromString(
    'Some text {"name":"John Doe","age":42,} more text',
    'object'
  )
  t.deepEqual(trailingComma[0], { name: 'John Doe', age: 42 })
})
test('extractJSONFromString should extract multiple JSON objects from string', (t) => {
  // Two flat objects separated by prose.
  const flatPair = extractJSONFromString(
    'Some text {"name":"John Doe"} more text {"name":"Jane Doe"}',
    'object'
  )
  t.deepEqual(flatPair[0], { name: 'John Doe' })
  t.deepEqual(flatPair[1], { name: 'Jane Doe' })

  // Two objects that each contain a nested object.
  const nestedPair = extractJSONFromString(
    'Some text {"name":"John Doe","age":42,"address":{"street":"Main Street","number":42}} more text {"name":"Jane Doe","age":42,"address":{"street":"Main Street","number":42}}',
    'object'
  )
  t.deepEqual(nestedPair[0], {
    name: 'John Doe',
    age: 42,
    address: { street: 'Main Street', number: 42 }
  })
  t.deepEqual(nestedPair[1], {
    name: 'Jane Doe',
    age: 42,
    address: { street: 'Main Street', number: 42 }
  })
})
test('extractJSONFromString should extract JSON array from string', (t) => {
  // Array of numbers surrounded by prose.
  const numbers = extractJSONFromString('Some text [1,2,3] more text', 'array')
  t.deepEqual(numbers[0], [1, 2, 3])

  // Array of strings, one of which contains single quotes.
  const strings = extractJSONFromString(
    'Some text ["foo","bar","\'quoted\'"] more text',
    'array'
  )
  t.deepEqual(strings[0], ['foo', 'bar', "'quoted'"])
})
test('extractJSONFromString should extract an invalid JSON array from string', (t) => {
  // Trailing comma after the last element.
  const trailingComma = extractJSONFromString(
    'Some text [1,2,3,] more text',
    'array'
  )
  t.deepEqual(trailingComma[0], [1, 2, 3])

  // Single-quoted string elements, which are not valid JSON as written.
  const singleQuoted = extractJSONFromString(
    "Some text ['foo','bar'] more text",
    'array'
  )
  t.deepEqual(singleQuoted[0], ['foo', 'bar'])
})
test('extractJSONFromString should extract multiple JSON arrays from string', (t) => {
  // Both arrays must be returned, in the order they appear in the text.
  const [first, second] = extractJSONFromString(
    'Some text [1,2,3] more text [4,5,6]',
    'array'
  )
  t.deepEqual(first, [1, 2, 3])
  t.deepEqual(second, [4, 5, 6])
})
test('extractJSONFromString should return an empty array if no JSON object is found', (t) => {
  // Plain prose without braces yields no extracted values.
  t.deepEqual(extractJSONFromString('Some text', 'object'), [])
})
test('extractJSONFromString should return an empty array if no JSON array is found', (t) => {
  // Plain prose without square brackets yields no extracted values.
  t.deepEqual(extractJSONFromString('Some text', 'array'), [])
})
test('parseArrayOutput - handles valid arrays correctly', (t) => {
const output1 = parseArrayOutput('[1,2,3]')
const output2 = parseArrayOutput('["a", "b", "c"]')
@ -40,22 +130,6 @@ test('parseArrayOutput - handles arrays surrounded by text correctly', (t) => {
)
})
test('parseArrayOutput - handles and repairs broken JSON arrays correctly', (t) => {
  // Parse every malformed input first, then record the snapshots in order.
  const [bare, inSentence, insideObject] = [
    '[1, "two, 3]',
    'Array: ["a, "b", "c"]. Error here!',
    'Array in text {"arr": ["value1, "value2"]}'
  ].map((raw) => parseArrayOutput(raw))

  t.snapshot(bare, 'should repair and return [1, "two", 3] for [1, "two, 3]')
  t.snapshot(
    inSentence,
    'should repair and return ["a", "b", "c"] for Array: ["a, "b", "c"]. Error here!'
  )
  t.snapshot(
    insideObject,
    'should repair and return {"arr": ["value1", "value2"]} for Array in text {"arr": ["value1, "value2"]}'
  )
})
test('parseArrayOutput - throws error for invalid arrays', (t) => {
const error = t.throws(
() => {
@ -99,22 +173,6 @@ test('parseObjectOutput - handles objects surrounded by text correctly', (t) =>
)
})
test('parseObjectOutput - handles and repairs broken JSON objects correctly', (t) => {
  // Parse both malformed inputs first, then record the snapshots in order.
  const [unterminated, inSentence] = [
    '{"a":1, "b":2, "c":3',
    'Object: {"name":"John,"age":30,"city":"New York"}. Error here!'
  ].map((raw) => parseObjectOutput(raw))

  t.snapshot(
    unterminated,
    'should repair and return {"a":1, "b":2, "c":3} for {"a":1, "b":2, "c":3'
  )
  t.snapshot(
    inSentence,
    'should repair and return {"name":"John","age":30,"city":"New York"} for Object: {"name":"John,"age":30,"city":"New York"}. Error here!'
  )
})
test('parseObjectOutput - handles JSON array of objects', (t) => {
const output = parseObjectOutput('[{"a":1,"b":2},{"c":3,"d":4}]')

26
test/utils.test.ts vendored
Wyświetl plik

@ -6,8 +6,6 @@ import {
chunkString,
defaultIDGeneratorFn,
extractFunctionIdentifierFromString,
extractJSONArrayFromString,
extractJSONObjectFromString,
isValidTaskIdentifier,
sleep,
stringifyForModel,
@ -33,30 +31,6 @@ test('isValidTaskIdentifier - invalid', async (t) => {
t.false(isValidTaskIdentifier('-foo'))
})
test('extractJSONObjectFromString should extract JSON object from string', (t) => {
  // The surrounding prose is dropped; only the braces and their content remain.
  t.is(
    extractJSONObjectFromString('Some text {"name":"John Doe"} more text'),
    '{"name":"John Doe"}'
  )
})
test('extractJSONArrayFromString should extract JSON array from string', (t) => {
  // The surrounding prose is dropped; only the brackets and their content remain.
  t.is(extractJSONArrayFromString('Some text [1,2,3] more text'), '[1,2,3]')
})
test('extractJSONObjectFromString should return undefined if no JSON object is found', (t) => {
  // Text without any braces has nothing to extract.
  t.is(extractJSONObjectFromString('Some text'), undefined)
})
test('extractJSONArrayFromString should return undefined if no JSON array is found', (t) => {
  // Text without any square brackets has nothing to extract.
  t.is(extractJSONArrayFromString('Some text'), undefined)
})
test('sleep should delay execution', async (t) => {
const start = Date.now()
await sleep(1000) // for example, 1000ms / 1sec