From 9a604891b80836bddbf8e0c5d5d73cd23dd1de0d Mon Sep 17 00:00:00 2001 From: Philipp Burckhardt Date: Wed, 21 Jun 2023 14:33:15 -0400 Subject: [PATCH] feat: improve JSON extraction --- src/llms/parse-output.ts | 110 ++++++++++++++-- src/utils.ts | 20 --- .../test/llms/parse-output.test.ts.md | 81 +----------- .../test/llms/parse-output.test.ts.snap | Bin 1516 -> 1217 bytes test/llms/parse-output.test.ts | 122 +++++++++++++----- test/utils.test.ts | 26 ---- 6 files changed, 189 insertions(+), 170 deletions(-) diff --git a/src/llms/parse-output.ts b/src/llms/parse-output.ts index 394c263..3dea6f6 100644 --- a/src/llms/parse-output.ts +++ b/src/llms/parse-output.ts @@ -1,11 +1,87 @@ import { JSONRepairError, jsonrepair } from 'jsonrepair' +import { JsonValue } from 'type-fest' import { ZodType, z } from 'zod' import * as errors from '@/errors' -import { - extractJSONArrayFromString, - extractJSONObjectFromString -} from '@/utils' + +/** + * Checks if character at the specified index in a string is escaped. + * + * @param str - string to check + * @param i - index of the character to check + * @returns whether the character is escaped + */ +function isEscaped(str: string, i: number): boolean { + return i > 0 && str[i - 1] === '\\' && !(i > 1 && str[i - 2] === '\\') +} + +/** + * Extracts JSON objects or arrays from a string. + * + * @param input - string to extract JSON from + * @param jsonStructureType - type of JSON structure to extract + * @returns array of extracted JSON objects or arrays + */ +export function extractJSONFromString( + input: string, + jsonStructureType: 'object' | 'array' +) { + const startChar = jsonStructureType === 'object' ? '{' : '[' + const endChar = jsonStructureType === 'object' ? '}' : ']' + const extractedJSONValues: JsonValue[] = [] + let nestingLevel = 0 + let startIndex = 0 + const isInsideQuoted = { '"': false, "'": false } + + for (let i = 0; i < input.length; i++) { + const ch = input.charAt(i) + switch (ch) { + case '"': + case "'": + if (!isInsideQuoted[ch === '"' ? "'" : '"'] && !isEscaped(input, i)) { + isInsideQuoted[ch] = !isInsideQuoted[ch] + } + + break + + default: + if (!isInsideQuoted['"'] && !isInsideQuoted["'"]) { + switch (ch) { + case startChar: + if (nestingLevel === 0) { + startIndex = i + } + + nestingLevel += 1 + + break + + case endChar: + nestingLevel -= 1 + if (nestingLevel === 0) { + const candidate = input.slice(startIndex, i + 1) + const parsed = JSON.parse(jsonrepair(candidate)) + if (parsed && typeof parsed === 'object') { + extractedJSONValues.push(parsed) + } + } else if (nestingLevel < 0) { + throw new Error( + `Invalid JSON string: unexpected ${endChar} at position ${i}` + ) + } + } + } + } + } + + if (nestingLevel !== 0) { + throw new Error( + 'Invalid JSON string: unmatched ' + startChar + ' or ' + endChar + ) + } + + return extractedJSONValues +} const BOOLEAN_OUTPUTS = { true: true, @@ -28,8 +104,12 @@ const BOOLEAN_OUTPUTS = { */ export function parseArrayOutput(output: string): Array { try { - const trimmedOutput = extractJSONArrayFromString(output) - const parsedOutput = JSON.parse(jsonrepair(trimmedOutput ?? output)) + const arr = extractJSONFromString(output, 'array') + if (arr.length === 0) { + throw new errors.OutputValidationError(`Invalid JSON array: ${output}`) + } + + const parsedOutput = arr[0] if (!Array.isArray(parsedOutput)) { throw new errors.OutputValidationError( `Invalid JSON array: ${JSON.stringify(parsedOutput)}` @@ -59,19 +139,22 @@ export function parseArrayOutput(output: string): Array { */ export function parseObjectOutput(output: string) { try { - const trimmedOutput = extractJSONObjectFromString(output) - output = JSON.parse(jsonrepair(trimmedOutput ?? output)) + const arr = extractJSONFromString(output, 'object') + if (arr.length === 0) { + throw new errors.OutputValidationError(`Invalid JSON object: ${output}`) + } - if (Array.isArray(output)) { + let parsedOutput = arr[0] + if (Array.isArray(parsedOutput)) { // TODO - output = output[0] - } else if (typeof output !== 'object') { + parsedOutput = parsedOutput[0] + } else if (typeof parsedOutput !== 'object') { throw new errors.OutputValidationError( - `Invalid JSON object: ${JSON.stringify(output)}` + `Invalid JSON object: ${JSON.stringify(parsedOutput)}` ) } - return output + return parsedOutput } catch (err: any) { if (err instanceof JSONRepairError) { throw new errors.OutputValidationError(err.message, { cause: err }) @@ -149,6 +232,7 @@ export function parseOutput(output: string, outputSchema: ZodType) { } else if (outputSchema instanceof z.ZodNumber) { result = parseNumberOutput(output, outputSchema) } else { + // Default to string output... result = output } diff --git a/src/utils.ts b/src/utils.ts index 6156256..3e8a2c2 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -4,26 +4,6 @@ import { JsonValue } from 'type-fest' import * as types from './types' -/** - * Extracts a JSON object string from a given string. - * - * @param text - string from which to extract the JSON object - * @returns extracted JSON object string, or `undefined` if no JSON object is found - */ -export function extractJSONObjectFromString(text: string): string | undefined { - return text.match(/\{(.|\n)*\}/gm)?.[0] // FIXME: This breaks if there are multiple JSON objects in the string -} - -/** - * Extracts a JSON array string from a given string. - * - * @param text - string from which to extract the JSON array - * @returns extracted JSON array string, or `undefined` if no JSON array is found - */ -export function extractJSONArrayFromString(text: string): string | undefined { - return text.match(/\[(.|\n)*\]/gm)?.[0] // FIXME: This breaks if there are multiple JSON arrays in the string -} - /** * Pauses the execution of a function for a specified time. * diff --git a/test/.snapshots/test/llms/parse-output.test.ts.md b/test/.snapshots/test/llms/parse-output.test.ts.md index 1fadd79..f59dbc4 100644 --- a/test/.snapshots/test/llms/parse-output.test.ts.md +++ b/test/.snapshots/test/llms/parse-output.test.ts.md @@ -62,38 +62,11 @@ Generated by [AVA](https://avajs.dev). }, ] -## parseArrayOutput - handles and repairs broken JSON arrays correctly - -> should repair and return [1, "two", 3] for [1, "two, 3] - - [ - 1, - 'two, 3]', - ] - -> should repair and return ["a", "b", "c"] for Array: ["a, "b", "c"]. Error here! - - [ - 'a, ', - 'b', - ', ', - 'c', - ']', - ] - -> should repair and return {"arr": ["value1", "value2"]} for Array in text {"arr": ["value1, "value2"]} - - [ - 'value1, ', - 'value2', - ']', - ] - ## parseArrayOutput - throws error for invalid arrays > Snapshot 1 - 'Invalid JSON array: "not a valid array"' + 'Invalid JSON array: not a valid array' ## parseObjectOutput - handles valid objects correctly @@ -131,25 +104,6 @@ Generated by [AVA](https://avajs.dev). name: 'John', } -## parseObjectOutput - handles and repairs broken JSON objects correctly - -> should repair and return {"a":1, "b":2, "c":3} for {"a":1, "b":2, "c":3 - - { - a: 1, - b: 2, - c: 3, - } - -> should repair and return {"name":"John","age":30,"city":"New York"} for Object: {"name":"John,"age":30,"city":"New York"}. Error here! - - { - 'New York': '}', - age: ':30,', - city: ':', - name: 'John,', - } - ## parseObjectOutput - handles JSON array of objects > should return first object {"a":1,"b":2} for [{"a":1,"b":2},{"c":3,"d":4}] @@ -163,7 +117,7 @@ Generated by [AVA](https://avajs.dev). > Snapshot 1 - 'Invalid JSON object: "not a valid object"' + 'Invalid JSON object: not a valid object' ## parseBooleanOutput - handles `true` outputs correctly @@ -261,34 +215,3 @@ Generated by [AVA](https://avajs.dev). > Snapshot 1 'Invalid number output: not a number' - -## parseOutput - handles array correctly - -> should parse and return [1, 2, 3] for "[1, 2, 3]" - - [ - 1, - 2, - 3, - ] - -## parseOutput - handles object correctly - -> should parse and return {"a": 1, "b": "two"} for "{"a": 1, "b": "two"}" - - { - a: 1, - b: 'two', - } - -## parseOutput - handles boolean correctly - -> should parse and return true for "True" - - true - -## parseOutput - handles number correctly - -> should parse and return 123.45 for "123.45" - - 123.45 diff --git a/test/.snapshots/test/llms/parse-output.test.ts.snap b/test/.snapshots/test/llms/parse-output.test.ts.snap index eb379dba43408eec2d5737eb93883fddb4be6a9b..5832b6147597987390bcf002e54960389a26e120 100644 GIT binary patch literal 1217 zcmV;y1U~ygRzVxOo`T1aD9pp6ra-;h4d7I1J-{gcJC67>)Wc6nzQo5lV57 zMJJ;q4iA$dVafN;`u&iF_o7*(EsvBxJ3cENuzRD4dqdugVjcE+`n;!H%*t!n~ghIq$ zeXn&NX@JxPaEq6(<^#B}0zha(6ir3Xumq3Bj=V<26NoXYKk@1>#(#0A`;cc7RMYD6^0GAhm7DyQH>=rZFUm2O zM9dRjCMk4vcTGYyG4JPyIprh-uzBXeQ_e&H{WHamtHWsb!xHUk)h1&~$vE!gKJ1=A zhL71r6LMZ5wvhLRScL*d3A`|>{&WUa%L#*gUjyz9L$=2A6Gf}&{~`7L8KusX&6CW3 zS(dn0*pQN=1Y*7dsGMLFXLt08{GU8kd1$41Sg5RktGBfI=I+k6 z@(Ex}7!0!qg8|Tz02FZr2MLj#&XeRjt%M)4k`^lXfR{VG#Q8LVsWjC}(o~S3d`B<0 zHs7-|CzH|iL?V2Tt2t<$NnRc8I-Z|h@!aL*oB89Zg$G#UyOQgw_eSi*yjGup@s<1BM4GELs8REvlDC}>ZvM{O4og@?j47y4A_ zSf|38bt({@v+S{;+6;(}1o06e5f0<2bUa{mj2}QE+snQ&wo(w)%95t+HJdUqlwMZ6 zhW?7*?K>Zr_+2UT6SYRFzZ>Nd55g#!JLF{z;q{zK#8?W$C!*>AKZEDS)~+jT;|U8r zWp%Rf+^H@WoMK>R!FAQZ+D~duH}U>4woz=4A9pdGVt*7fe1K^g`}#=vJOxp&fVVXj zuh>+yeZ@oDM-Hbyw9~7$(|SiaK~Y2Y^{D{$AR-}~0rPHV`b%J;?Kj%KJN>fDfN!Mu zZ(9S@*E@}l_Fv5h(MZEkFUa0?a*@n0f_E$ttQrx-a@kXld0N2jGR)j8prj)a<#921^p0&Uhn|`a&HcRYU5eQ|aV5oWu-E{oMepyP_77aQ?e); z8*qsM_aBQ000000000B!SX*!0L=@iHOHy%B62SvVP$q*)fI3Up*|cB~YKqdTAta(n zxu}X-@9u1N8`q9(FG&=(Qi}wyJi-HlmOsH05Bv#!3@<(7nelqOWA84gkd~?wk7v%z zeCM3+oH^?U?S|tu_ktf@dPu`==+ML3zU>Ee#rN%_&0e_Q3-KcE*lx?A0shK%x-D$8 zxd1mk->1#cIqC+ky&rVEF!<(St7V7w7fz`#U10yvDO9e*(}K1J?FO_L(9|5NLjWZg1C;vFF|P?Q5jF;Jq8-Js6l89>l;TG*#y&v)^Snt@FN zFYVyA=VJn|rm?g`1VCR0a2?vs$pBX75VBQ>Y^b4vWJ65>(@eo6HZd@1us=;A!K#A- z^PdUKrE~B+5AEV8ykM_q5&D=fgr3iyi0x7Q5xPIi^;4UT+u%)9jJvb5vys?uAo6=! ztfY!PAMH8AbRO7c;6mMWLpoF#=KYjmj==~PuJsLk43@C4+2_{LO-Sy3n8RJc+MwtA zUe9gO7H%BjkRFC7jmT++s6^iGP_7E@2KdCV`pOurjt;cab5*?EvBL%MkK@df{|D7? zj7fEHHaHpnhq2SOvRyZlU}PpT6h6ue1(%(@Z+HCwH+*l8x_E7Cb7PFeepBa|%VRhu zFBZT=)8T=~I4%ryAvO>LDe%V8KtPzp5g|v|fI#_c0_D^P@Vo@=GiY6CeQ4j0LR?A5 zL&-MqI1y1Ch?NFWB8STHkkoWDTT@L84MIqXpjAj|KJHMT&I=3N)n$S$XnPZ9f>Utm z_h?O{{!}z$4rc|a9L-b?SnT_RX~K7Bs_@;&PJf_k#B7$L)AC7i$#_%4_^{)92LYy$ zf!MZPH(A3B2!8JYH{5b;xJqFTEnF4igYd90alH`R$yz85x}^vLPZk-)lNU{DgY3R= z-gimou>fNfz7EYzrzSL%no2pTDI+PRCz`4I=sh)CMG2^8u;gPJz&~q=J4`b!^!Z7r z&*#v-o?M@jy-NkY)2XR)vl|}8tlgjo_)E{QH z7Bo5sioqc;EVn08`EkW1RW9aHMUrH^V&#PL=IVI7i7FbdFuUqBV&qBU(cOQqHJY+Ot4$dPZvhrA1VN@}MVZO~V%tVaT5;C8U$#O*bsE|R&!R3m*wj^QVY8KA9 zovb}gc)knm)6q^DBGG8t2_vqlvD}7u31zBE=Ba+-hp8Mgjw50jq(!Xy;1_CLXNb%y zW!M&?Ou3+xDQ_udiYGj#)G2&0P7liwj7y&9P}?1H$gR-t(OcMy#`As;tj92lg&ASA zS5>32twf%O9{D@#u3cZvS$8^X9qUzNjnC(6+_s%yVwGnFgx69$VS8~5mswW>evFkX z>svatZhpgXqL%K`-JnM`5 z=aU!&q}sQIDPC4hQ8&3A>L$@F`b#W&MP0PG%okzSOV#rksN0Tbhht#gN*q6pDs{73 zH}&q1rX=`kjQ^UlL3wexx-8bur#&-Aq!w+yZVbut8`s0K8O_{&OR$kDnnNwFuJ>7rla$_Y&c;KbVry)(n zS0-L0@eeMdK-JB}`oF2yUnGP)1L!jQLKeF8osw^TPR_i;x0L{!s+mjGx(ufIKE^UwF7M&T8)6x@ S7u { + let jsonStr = 'Some text {"name":"John Doe"} more text' + let result = extractJSONFromString(jsonStr, 'object') + t.deepEqual(result[0], { name: 'John Doe' }) + + jsonStr = + 'Some text {"name":"John Doe","age":42,"address":{"street":"Main Street","number":42}} more text' + result = extractJSONFromString(jsonStr, 'object') + t.deepEqual(result[0], { + name: 'John Doe', + age: 42, + address: { street: 'Main Street', number: 42 } + }) + + jsonStr = 'foo {"name":"John Doe","school":"St. John\'s"} bar' + result = extractJSONFromString(jsonStr, 'object') + t.deepEqual(result[0], { name: 'John Doe', school: "St. John's" }) +}) + +test('extractJSONFromString should extract an invalid JSON object from string', (t) => { + let jsonStr = 'Some text {"name":\'John Doe\'} more text' + let result = extractJSONFromString(jsonStr, 'object') + t.deepEqual(result[0], { name: 'John Doe' }) + + jsonStr = 'Some text {"name":"John Doe","age":42,} more text' + result = extractJSONFromString(jsonStr, 'object') + t.deepEqual(result[0], { name: 'John Doe', age: 42 }) +}) + +test('extractJSONFromString should extract multiple JSON objects from string', (t) => { + let jsonStr = 'Some text {"name":"John Doe"} more text {"name":"Jane Doe"}' + let result = extractJSONFromString(jsonStr, 'object') + t.deepEqual(result[0], { name: 'John Doe' }) + t.deepEqual(result[1], { name: 'Jane Doe' }) + + jsonStr = + 'Some text {"name":"John Doe","age":42,"address":{"street":"Main Street","number":42}} more text {"name":"Jane Doe","age":42,"address":{"street":"Main Street","number":42}}' + result = extractJSONFromString(jsonStr, 'object') + t.deepEqual(result[0], { + name: 'John Doe', + age: 42, + address: { street: 'Main Street', number: 42 } + }) + t.deepEqual(result[1], { + name: 'Jane Doe', + age: 42, + address: { street: 'Main Street', number: 42 } + }) +}) + +test('extractJSONFromString should extract JSON array from string', (t) => { + let jsonString = 'Some text [1,2,3] more text' + let result = extractJSONFromString(jsonString, 'array') + t.deepEqual(result[0], [1, 2, 3]) + + jsonString = 'Some text ["foo","bar","\'quoted\'"] more text' + result = extractJSONFromString(jsonString, 'array') + t.deepEqual(result[0], ['foo', 'bar', "'quoted'"]) +}) + +test('extractJSONFromString should extract an invalid JSON array from string', (t) => { + let jsonString = 'Some text [1,2,3,] more text' + let result = extractJSONFromString(jsonString, 'array') + t.deepEqual(result[0], [1, 2, 3]) + + jsonString = "Some text ['foo','bar'] more text" + result = extractJSONFromString(jsonString, 'array') + t.deepEqual(result[0], ['foo', 'bar']) +}) + +test('extractJSONFromString should extract multiple JSON arrays from string', (t) => { + const jsonString = 'Some text [1,2,3] more text [4,5,6]' + const result = extractJSONFromString(jsonString, 'array') + t.deepEqual(result[0], [1, 2, 3]) + t.deepEqual(result[1], [4, 5, 6]) +}) + +test('extractJSONFromString should return an empty array if no JSON object is found', (t) => { + const jsonString = 'Some text' + const result = extractJSONFromString(jsonString, 'object') + t.deepEqual(result, []) +}) + +test('extractJSONFromString should return an empty array if no JSON array is found', (t) => { + const jsonString = 'Some text' + const result = extractJSONFromString(jsonString, 'array') + t.deepEqual(result, []) +}) + test('parseArrayOutput - handles valid arrays correctly', (t) => { const output1 = parseArrayOutput('[1,2,3]') const output2 = parseArrayOutput('["a", "b", "c"]') @@ -40,22 +130,6 @@ test('parseArrayOutput - handles arrays surrounded by text correctly', (t) => { ) }) -test('parseArrayOutput - handles and repairs broken JSON arrays correctly', (t) => { - const output1 = parseArrayOutput('[1, "two, 3]') - const output2 = parseArrayOutput('Array: ["a, "b", "c"]. Error here!') - const output3 = parseArrayOutput('Array in text {"arr": ["value1, "value2"]}') - - t.snapshot(output1, 'should repair and return [1, "two", 3] for [1, "two, 3]') - t.snapshot( - output2, - 'should repair and return ["a", "b", "c"] for Array: ["a, "b", "c"]. Error here!' - ) - t.snapshot( - output3, - 'should repair and return {"arr": ["value1", "value2"]} for Array in text {"arr": ["value1, "value2"]}' - ) -}) - test('parseArrayOutput - throws error for invalid arrays', (t) => { const error = t.throws( () => { @@ -99,22 +173,6 @@ test('parseObjectOutput - handles objects surrounded by text correctly', (t) => ) }) -test('parseObjectOutput - handles and repairs broken JSON objects correctly', (t) => { - const output1 = parseObjectOutput('{"a":1, "b":2, "c":3') - const output2 = parseObjectOutput( - 'Object: {"name":"John,"age":30,"city":"New York"}. Error here!' - ) - - t.snapshot( - output1, - 'should repair and return {"a":1, "b":2, "c":3} for {"a":1, "b":2, "c":3' - ) - t.snapshot( - output2, - 'should repair and return {"name":"John","age":30,"city":"New York"} for Object: {"name":"John,"age":30,"city":"New York"}. Error here!' - ) -}) - test('parseObjectOutput - handles JSON array of objects', (t) => { const output = parseObjectOutput('[{"a":1,"b":2},{"c":3,"d":4}]') diff --git a/test/utils.test.ts b/test/utils.test.ts index 0e4804e..59079f3 100644 --- a/test/utils.test.ts +++ b/test/utils.test.ts @@ -6,8 +6,6 @@ import { chunkString, defaultIDGeneratorFn, extractFunctionIdentifierFromString, - extractJSONArrayFromString, - extractJSONObjectFromString, isValidTaskIdentifier, sleep, stringifyForModel, @@ -33,30 +31,6 @@ test('isValidTaskIdentifier - invalid', async (t) => { t.false(isValidTaskIdentifier('-foo')) }) -test('extractJSONObjectFromString should extract JSON object from string', (t) => { - const jsonString = 'Some text {"name":"John Doe"} more text' - const result = extractJSONObjectFromString(jsonString) - t.is(result, '{"name":"John Doe"}') -}) - -test('extractJSONArrayFromString should extract JSON array from string', (t) => { - const jsonString = 'Some text [1,2,3] more text' - const result = extractJSONArrayFromString(jsonString) - t.is(result, '[1,2,3]') -}) - -test('extractJSONObjectFromString should return undefined if no JSON object is found', (t) => { - const jsonString = 'Some text' - const result = extractJSONObjectFromString(jsonString) - t.is(result, undefined) -}) - -test('extractJSONArrayFromString should return undefined if no JSON array is found', (t) => { - const jsonString = 'Some text' - const result = extractJSONArrayFromString(jsonString) - t.is(result, undefined) -}) - test('sleep should delay execution', async (t) => { const start = Date.now() await sleep(1000) // for example, 1000ms / 1sec