modified simplifyContent()

pull/73/head
Huda Joad 2023-11-25 00:03:43 +03:00
rodzic b866b6e7d0
commit 4a5bb87b0e
1 zmienionych plików z 7 dodań i 20 usunięć

Wyświetl plik

@ -1,5 +1,5 @@
const fetch = require('node-fetch'); // Import for webscraping (fetchContentFromURL(url) function
import { OpenAIApi, Configuration } from 'openai';
const fetch = require('node-fetch'); // Import for webscraping in fetchContentFromURL()
const { Configuration, OpenAIApi } = require('openai');
// Function to fetch content from URL using a web scraping service
async function fetchContentFromURL(url) {
@ -16,24 +16,11 @@ async function fetchContentFromURL(url) {
}
/**
 * Reduce raw HTML to plain, whitespace-normalized text.
 *
 * Steps, in order:
 *   1. drop <style> and <script> elements together with their contents,
 *   2. strip every remaining HTML tag,
 *   3. remove every character that is not a word character or whitespace,
 *   4. collapse whitespace runs into single spaces and trim the ends.
 *
 * @param {string} content - Raw HTML (or plain text) to simplify.
 * @returns {string} The simplified text; '' when content is null or undefined.
 */
function simplifyContent(content) {
  if (content == null) {
    return '';
  }

  return String(content)
    // Style/script elements must be removed BEFORE the generic tag strip:
    // once the tags are gone these patterns can no longer match and the
    // CSS/JS source would be left behind in the text. The lazy quantifier
    // keeps the text that sits between two separate elements.
    .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, '')
    .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '')
    // Strip the remaining tags, keeping their inner text.
    .replace(/<[^>]*>/g, '')
    // Keep only [A-Za-z0-9_] and whitespace. Note that \w is ASCII-only
    // here, so punctuation and non-ASCII letters are removed as well.
    .replace(/[^\w\s]/g, '')
    // Normalize whitespace.
    .replace(/\s+/g, ' ')
    .trim();
}