kopia lustrzana https://github.com/learn-awesome/learndb
Modified the simplifyContent function to retain punctuation, making its output more suitable for LLM analysis
rodzic
cf1833e684
commit
73443e4ce4
|
@@ -22,20 +22,24 @@ function simplifyContent(content) {
|
||||||
simplifiedContent = simplifiedContent.replace(/<style.*?>.*?<\/style>/gms, '');
|
simplifiedContent = simplifiedContent.replace(/<style.*?>.*?<\/style>/gms, '');
|
||||||
// Remove all remaining HTML tags, leaving the inner text
|
// Remove all remaining HTML tags, leaving the inner text
|
||||||
simplifiedContent = simplifiedContent.replace(/<[^>]+>/g, '');
|
simplifiedContent = simplifiedContent.replace(/<[^>]+>/g, '');
|
||||||
// Decode HTML entities
|
// Decode HTML entities - for a Node.js environment, consider using a library like 'he'
|
||||||
simplifiedContent = simplifiedContent.replace(/&[a-z]+;/gi, match => {
|
simplifiedContent = simplifiedContent.replace(/&[a-z]+;/gi, match => {
|
||||||
|
// This part is for browser environments, adjust for Node.js if necessary
|
||||||
const span = document.createElement('span');
|
const span = document.createElement('span');
|
||||||
span.innerHTML = match;
|
span.innerHTML = match;
|
||||||
return span.textContent || span.innerText;
|
return span.textContent || span.innerText;
|
||||||
});
|
});
|
||||||
// Remove any residual CSS and JS (inline events, style attributes)
|
// Remove inline CSS and JavaScript event handlers
|
||||||
simplifiedContent = simplifiedContent.replace(/style\s*=\s*'.*?'/gi, '');
|
simplifiedContent = simplifiedContent.replace(/style\s*=\s*'.*?'/gi, '');
|
||||||
simplifiedContent = simplifiedContent.replace(/on\w+\s*=\s*".*?"/gi, '');
|
simplifiedContent = simplifiedContent.replace(/on\w+\s*=\s*".*?"/gi, '');
|
||||||
// Remove special characters and extra whitespace
|
// Normalize whitespace without removing sentence punctuation
|
||||||
simplifiedContent = simplifiedContent.replace(/[^\w\s]/gi, '').replace(/\s+/g, ' ').trim();
|
simplifiedContent = simplifiedContent.replace(/\s+/g, ' ').trim();
|
||||||
return simplifiedContent.toLowerCase();
|
// Condense multiple line breaks into a single one
|
||||||
|
simplifiedContent = simplifiedContent.replace(/(\r\n|\r|\n){2,}/g, '\n');
|
||||||
|
return simplifiedContent;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Placeholder function to perform GPT analysis for media type and topics using Mistral-7b via OpenRouter
|
// Placeholder function to perform GPT analysis for media type and topics using Mistral-7b via OpenRouter
|
||||||
async function performGPTAnalysis(content) {
|
async function performGPTAnalysis(content) {
|
||||||
// Implement logic to send content to Mistral-7b via OpenRouter for GPT analysis
|
// Implement logic to send content to Mistral-7b via OpenRouter for GPT analysis
|
||||||
|
|
Ładowanie…
Reference in New Issue