kopia lustrzana https://github.com/learn-awesome/learndb
Porównaj commity
5 Commity
186d6c22dd
...
ded050d616
Autor | SHA1 | Data |
---|---|---|
Huda Joad | ded050d616 | |
Huda Joad | d9735112c5 | |
Huda Joad | 7de9f17ec3 | |
Huda Joad | 5b3c6578aa | |
Huda Joad | 9d8827ab52 |
|
@ -17,31 +17,50 @@ async function fetchContentFromURL(url) {
|
|||
}
|
||||
|
||||
/**
 * Reduce an HTML document to a compact `<title>…</title><body>…</body>` string.
 *
 * Processing steps, in order:
 *   1. Headings (`<h1>`–`<h6>`) are rewritten to a `### ` text marker.
 *   2. The contents of the first `<title>` element are captured.
 *   3. The contents of `<body>` are extracted; when there is no body tag the
 *      whole input is treated as the body.
 *   4. `<script>` / `<style>` elements are dropped together with their content.
 *   5. Every tag except title, body, h1-h6, p and a is stripped (inner text stays).
 *   6. The five common HTML entities are decoded.
 *   7. Inline `style="…"` attributes and `on…="…"` handlers are removed.
 *   8. All whitespace runs are collapsed to a single space.
 *
 * @param {string} content - Raw HTML (or plain text). `null`/`undefined` is
 *   treated as an empty document.
 * @returns {string} `<title>${title}</title><body>${body}</body>` on one line.
 */
function simplifyContent(content) {
  // Work on a local copy instead of reassigning the parameter; tolerate
  // missing input so callers get an empty document rather than a TypeError.
  const source = content == null ? '' : String(content);

  // Denote headings with a marker (like '###'). The surrounding line breaks
  // become plain spaces once whitespace is normalised further down.
  const html = source.replace(/<h[1-6].*?>(.*?)<\/h[1-6]>/g, '\n### $1\n');

  // Preserve the title tag's content.
  const titleMatch = html.match(/<title.*?>(.*?)<\/title>/i);
  const title = titleMatch ? titleMatch[1] : '';

  // Extract the body content; if there is no body tag, assume the entire
  // content is the body.
  const bodyMatch = html.match(/<body.*?>([\s\S]*)<\/body>/i);
  let bodyContent = bodyMatch ? bodyMatch[1] : html;

  // Remove script and style elements and their content (any letter case).
  bodyContent = bodyContent
    .replace(/<script.*?>.*?<\/script>/gims, '')
    .replace(/<style.*?>.*?<\/style>/gims, '');

  // Remove all remaining HTML tags, except for title, body, h1-h6, p, and a.
  bodyContent = bodyContent.replace(/<(?!\/?(title|body|h[1-6]|p|a)( [^>]*)?>)([^>]+)>/g, '');

  // Decode common HTML entities in a single pass so that an escaped
  // ampersand such as `&amp;lt;` yields `&lt;` and is not decoded twice.
  const entityMap = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
    '&apos;': "'",
  };
  bodyContent = bodyContent.replace(
    /&(?:amp|lt|gt|quot|#39|apos);/g,
    (entity) => entityMap[entity],
  );

  // Remove inline CSS and JavaScript event handlers, whichever quote style
  // the attribute value uses. `\b` keeps words that merely end in "on…"
  // (e.g. `buttons=`) from being mistaken for a handler.
  bodyContent = bodyContent
    .replace(/\s*\bstyle\s*=\s*(["']).*?\1/gi, '')
    .replace(/\s*\bon\w+\s*=\s*(["']).*?\1/gi, '');

  // Normalize whitespace without removing sentence punctuation. This also
  // folds every line break into a space, so no separate line-break
  // condensing step is needed afterwards.
  bodyContent = bodyContent.replace(/\s+/g, ' ').trim();

  // Reconstruct content with title and body.
  return `<title>${title}</title><body>${bodyContent}</body>`;
}
|
||||
|
||||
|
||||
// Placeholder function to perform GPT analysis for media type and topics using Mistral-7b via OpenRouter
|
||||
async function performGPTAnalysis(simplifiedContent, apiKey) {
|
||||
// Implement logic to send content to Mistral-7b via OpenRouter for GPT analysis
|
||||
|
|
Ładowanie…
Reference in New Issue