Porównaj commity

...

5 commitów

Autor SHA1 Wiadomość Data
Huda Joad ded050d616 slightly modified simplifyContent() to preserve more tags 2023-12-04 23:57:46 +03:00
Huda Joad d9735112c5 preserve body tag 2023-12-04 23:48:24 +03:00
Huda Joad 7de9f17ec3 preserve body tag 2023-12-04 23:48:11 +03:00
Huda Joad 5b3c6578aa modified simplifyContent() 2023-12-04 23:43:59 +03:00
Huda Joad 9d8827ab52 changed the function a little to keep the tags 2023-12-04 23:38:18 +03:00
1 zmieniony plik, 31 dodań i 12 usunięć

Wyświetl plik

@ -17,31 +17,50 @@ async function fetchContentFromURL(url) {
}
/**
 * Reduces an HTML document to a compact `<title>…</title><body>…</body>` string.
 *
 * Steps, in order:
 *  1. Single-line `<h1>`–`<h6>` elements become `\n### text\n` markers.
 *  2. The `<title>` text is captured (empty string when absent).
 *  3. The `<body>` inner HTML is extracted; without a body tag the whole input is used.
 *  4. `<script>` / `<style>` elements are dropped together with their content.
 *  5. Every tag except title, body, h1-h6, p and a is removed (inner text stays);
 *     kept tags lose their inline `style` and `on*` event-handler attributes.
 *  6. A small set of HTML entities is decoded.
 *  7. Whitespace is normalized: horizontal runs collapse to one space, runs that
 *     contain a line break collapse to a single `\n`.
 *
 * @param {string} content - Raw HTML (null/undefined is treated as an empty string).
 * @returns {string} The simplified markup.
 */
function simplifyContent(content) {
  const html = content == null ? '' : String(content);

  // Denote headings with a marker (like '###') and add a line break
  const marked = html.replace(/<h[1-6].*?>(.*?)<\/h[1-6]>/g, '\n### $1\n');

  // Preserve the title text
  const titleMatch = marked.match(/<title.*?>(.*?)<\/title>/i);
  const title = titleMatch ? titleMatch[1] : '';

  // Extract the body content; if there is no body tag, assume everything is body
  const bodyMatch = marked.match(/<body.*?>([\s\S]*)<\/body>/i);
  let bodyContent = bodyMatch ? bodyMatch[1] : marked;

  // Remove script and style elements and their content (tag names in any case)
  bodyContent = bodyContent
    .replace(/<script.*?>.*?<\/script>/gis, '')
    .replace(/<style.*?>.*?<\/style>/gis, '');

  // Tags that survive simplification. No `g` flag, so `.test()` stays stateless.
  const keptTag = /^<\/?(title|body|h[1-6]|p|a)(\s[^>]*)?>$/i;
  // Inline CSS and event handlers, with double-quoted, single-quoted or bare values.
  const unsafeAttribute = /\s+(?:style|on\w+)\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)/gi;

  // Drop non-whitelisted tags; clean attributes only inside the tags we keep so
  // that visible text resembling `onfoo="..."` is never altered.
  bodyContent = bodyContent.replace(/<[^>]+>/g, (tag) =>
    keptTag.test(tag) ? tag.replace(unsafeAttribute, '') : '',
  );

  // Decode common entities. `&amp;` must go last, otherwise `&amp;lt;` would be
  // decoded twice and turn into `<`.
  bodyContent = bodyContent
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, '&');

  // Normalize whitespace while keeping line breaks: first collapse horizontal
  // runs, then condense any run containing a line break into a single '\n'.
  bodyContent = bodyContent
    .replace(/[^\S\r\n]+/g, ' ')
    .replace(/ ?[\r\n][ \r\n]*/g, '\n')
    .trim();

  // Reconstruct content with title and body
  return `<title>${title}</title><body>${bodyContent}</body>`;
}
// Placeholder function to perform GPT analysis for media type and topics using Mistral-7b via OpenRouter
async function performGPTAnalysis(simplifiedContent, apiKey) {
// Implement logic to send content to Mistral-7b via OpenRouter for GPT analysis