// Provenance: post-patch version of simplifyContent from
// netlify/functions/handleMetadata.js, as changed by commit
// cf1833e6846dfbf7ffa7d2b046ac429c8a2ddb09 (Huda Joad, Sat, 25 Nov 2023 00:15:52 +0300,
// "modified simplifyContent to make it cover more html"), with review fixes applied.

// Named entities whose decoded character can influence the final result:
// markup/quote characters (seen by the attribute regexes below) and
// whitespace-like entities (which must become a word separator).
const SIMPLIFY_NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
  ['ensp', ' '],
  ['emsp', ' '],
  ['thinsp', ' '],
]);

/**
 * Decode one HTML character reference without needing a DOM.
 *
 * @param {string} entity - The full match, e.g. `&amp;` (unused; kept so the
 *   function can be passed directly as a `String.prototype.replace` callback).
 * @param {string} body - The text between `&` and `;`, e.g. `amp`, `#39`, `#x27`.
 * @returns {string} The decoded character, or `''` when the reference is not in
 *   the table above or its code point is out of range.
 */
function decodeHtmlEntity(entity, body) {
  if (body[0] !== '#') {
    const name = body.toLowerCase();
    // Entities not in the table are dropped: simplifyContent strips every
    // character outside [A-Za-z0-9_] and whitespace afterwards, and leaving
    // the raw name in place would leak words like "eacute" into the output.
    return SIMPLIFY_NAMED_ENTITIES.has(name) ? SIMPLIFY_NAMED_ENTITIES.get(name) : '';
  }

  const isHex = body[1] === 'x' || body[1] === 'X';
  const codePoint = isHex
    ? Number.parseInt(body.slice(2), 16)
    : Number.parseInt(body.slice(1), 10);

  // String.fromCodePoint throws a RangeError above U+10FFFF; U+0000 is useless.
  if (!(codePoint > 0 && codePoint <= 0x10ffff)) {
    return '';
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Reduce an HTML document to a lower-case string of plain words.
 *
 * Steps, in order: drop `<script>`/`<style>` elements with their content,
 * drop all remaining tags, decode character references, drop literal
 * `style='…'` / `on…="…"` attribute text, drop every character that is not
 * `[A-Za-z0-9_]` or whitespace, collapse whitespace runs, trim, lower-case.
 *
 * @param {string} content - Raw HTML (or plain text). `null`/`undefined`
 *   yield an empty string; other non-strings are converted with `String()`.
 * @returns {string} The simplified, lower-cased text.
 */
function simplifyContent(content) {
  if (content == null) {
    return '';
  }

  let simplifiedContent = String(content);

  // Remove script and style elements and their content. The opening tag must
  // be part of the pattern: in the patch text as received it was missing
  // (`/.*?<\/script>/`), which would delete everything from the start of the
  // input up to each closing tag. `i` also covers upper-case tags; `s` lets
  // `.` span newlines.
  simplifiedContent = simplifiedContent
    .replace(/<script\b[^>]*>.*?<\/script\s*>/gis, '')
    .replace(/<style\b[^>]*>.*?<\/style\s*>/gis, '');

  // Remove all remaining HTML tags, leaving the inner text.
  simplifiedContent = simplifiedContent.replace(/<[^>]+>/g, '');

  // Decode HTML entities (named and numeric). The patch did this through
  // `document.createElement`, which requires a DOM; this file lives under
  // netlify/functions, so it presumably runs server-side without one
  // (TODO confirm). A pure string decoder works in either environment.
  simplifiedContent = simplifiedContent.replace(
    /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi,
    decodeHtmlEntity,
  );

  // Remove any residual CSS and JS (inline events, style attributes). Real
  // attributes are already gone with their tags; this catches attribute text
  // that only became visible after entity decoding (escaped markup).
  simplifiedContent = simplifiedContent.replace(/style\s*=\s*'.*?'/gi, '');
  simplifiedContent = simplifiedContent.replace(/on\w+\s*=\s*".*?"/gi, '');

  // Remove special characters and extra whitespace.
  simplifiedContent = simplifiedContent
    .replace(/[^\w\s]/gi, '')
    .replace(/\s+/g, ' ')
    .trim();

  return simplifiedContent.toLowerCase();
}

// Placeholder function to perform GPT analysis for media type and topics using Mistral-7b via OpenRouter