/**
 * Reduce an HTML document to its title and the plain text of its body.
 *
 * Steps, in order:
 *   1. Capture the inner text of the first <title> element (may span lines).
 *   2. Take the inner markup of <body>; if there is no <body> element the
 *      whole input is treated as the body.
 *   3. Drop <script> and <style> elements together with their content.
 *   4. Strip every remaining tag except <title>/<body> tags.
 *   5. Decode the five basic HTML entities.
 *   6. Remove single-quoted inline `style='…'` and double-quoted `on…="…"`
 *      attribute text that is still present.
 *   7. Collapse all whitespace runs (including newlines) to one space.
 *
 * @param {string} content - Raw HTML markup. Non-string input is treated as
 *   an empty document instead of throwing.
 * @returns {string} A string of the form
 *   `<title>TITLE</title><body>TEXT</body>`; TITLE and TEXT are empty when
 *   nothing could be extracted.
 */
function simplifyContent(content) {
  const html = typeof content === 'string' ? content : '';

  // [\s\S] instead of "." so a title broken over several lines still matches.
  const titleMatch = html.match(/<title\b[^>]*>([\s\S]*?)<\/title\s*>/i);
  const title = titleMatch ? titleMatch[1].replace(/\s+/g, ' ').trim() : '';

  // Greedy capture on purpose: run to the LAST </body> so stray closing tags
  // inside the document do not truncate the text.
  const bodyMatch = html.match(/<body\b[^>]*>([\s\S]*)<\/body\s*>/i);
  const rawBody = bodyMatch ? bodyMatch[1] : html;

  const bodyContent = rawBody
    // Script/style content is never visible text; match tags case-insensitively.
    .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, '')
    // Keep only title/body tags; \b stops look-alikes such as <titlebar>
    // from being spared.
    .replace(/<(?!\/?(?:title|body)\b)[^>]+>/gi, '')
    // Decode "&amp;" LAST: decoding it first would turn "&amp;lt;" into "<"
    // (double decoding) instead of the intended literal "&lt;".
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&(?:#0*39|apos);/g, "'")
    .replace(/&amp;/g, '&')
    // \b keeps these from matching inside ordinary words
    // (e.g. the "on" at the end of `configuration="x"`).
    .replace(/\bstyle\s*=\s*'.*?'/gi, '')
    .replace(/\bon\w+\s*=\s*".*?"/gi, '')
    // Newlines are whitespace too, so no separate line-break pass is needed.
    .replace(/\s+/g, ' ')
    .trim();

  return `<title>${title}</title><body>${bodyContent}</body>`;
}