Modified the simplifyContent function to retain punctuation, making its output more suitable for LLM analysis

pull/73/head
Huda Joad 2023-11-25 00:21:55 +03:00
rodzic cf1833e684
commit 73443e4ce4
1 zmienionych plików z 9 dodań i 5 usunięć

Wyświetl plik

@@ -22,20 +22,24 @@ function simplifyContent(content) {
simplifiedContent = simplifiedContent.replace(/<style.*?>.*?<\/style>/gms, ''); simplifiedContent = simplifiedContent.replace(/<style.*?>.*?<\/style>/gms, '');
// Remove all remaining HTML tags, leaving the inner text // Remove all remaining HTML tags, leaving the inner text
simplifiedContent = simplifiedContent.replace(/<[^>]+>/g, ''); simplifiedContent = simplifiedContent.replace(/<[^>]+>/g, '');
// Decode HTML entities // Decode HTML entities - for a Node.js environment, consider using a library like 'he'
simplifiedContent = simplifiedContent.replace(/&[a-z]+;/gi, match => { simplifiedContent = simplifiedContent.replace(/&[a-z]+;/gi, match => {
// This part is for browser environments, adjust for Node.js if necessary
const span = document.createElement('span'); const span = document.createElement('span');
span.innerHTML = match; span.innerHTML = match;
return span.textContent || span.innerText; return span.textContent || span.innerText;
}); });
// Remove any residual CSS and JS (inline events, style attributes) // Remove inline CSS and JavaScript event handlers
simplifiedContent = simplifiedContent.replace(/style\s*=\s*'.*?'/gi, ''); simplifiedContent = simplifiedContent.replace(/style\s*=\s*'.*?'/gi, '');
simplifiedContent = simplifiedContent.replace(/on\w+\s*=\s*".*?"/gi, ''); simplifiedContent = simplifiedContent.replace(/on\w+\s*=\s*".*?"/gi, '');
// Remove special characters and extra whitespace // Normalize whitespace without removing sentence punctuation
simplifiedContent = simplifiedContent.replace(/[^\w\s]/gi, '').replace(/\s+/g, ' ').trim(); simplifiedContent = simplifiedContent.replace(/\s+/g, ' ').trim();
return simplifiedContent.toLowerCase(); // Condense multiple line breaks into a single one
simplifiedContent = simplifiedContent.replace(/(\r\n|\r|\n){2,}/g, '\n');
return simplifiedContent;
} }
// Placeholder function to perform GPT analysis for media type and topics using Mistral-7b via OpenRouter // Placeholder function to perform GPT analysis for media type and topics using Mistral-7b via OpenRouter
async function performGPTAnalysis(content) { async function performGPTAnalysis(content) {
// Implement logic to send content to Mistral-7b via OpenRouter for GPT analysis // Implement logic to send content to Mistral-7b via OpenRouter for GPT analysis