learndb/netlify/functions/handleMetadata.js

116 wiersze
4.6 KiB
JavaScript

const https = require('https'); // Import for webscraping (fetchContentFromURL(url) function
// const fetch = require('node-fetch');
const { OpenAIApi, Configuration } = require('openai');
// Placeholder function to fetch content from URL using a web scraping service
async function fetchContentFromURL(url) {
return new Promise((resolve, reject) => {
https.get(url, (response) => {
let data = '';
// A chunk of data has been received.
response.on('data', (chunk) => {
data += chunk;
});
// The whole response has been received.
response.on('end', () => {
resolve(data);
});
}).on("error", (error) => {
reject(`Error fetching URL: ${error.message}`);
});
});
}
function simplifyContent(content) {
// Remove HTML tags
let simplifiedContent = content.replace(/<[^>]*>/g, '');
// Remove CSS within style tags
simplifiedContent = simplifiedContent.replace(/<style[^>]*>.*<\/style>/gms, '');
// Remove inline CSS and JavaScript within script tags
simplifiedContent = simplifiedContent.replace(/<script[^>]*>.*<\/script>/gms, '');
// Remove special characters and HTML entities
simplifiedContent = simplifiedContent.replace(/[^\w\s]/gi, '').replace(/&[a-z]+;/gi, '');
// Remove URLs
simplifiedContent = simplifiedContent.replace(/https?:\/\/[^\s]+/gi, '');
// Normalize whitespace
simplifiedContent = simplifiedContent.replace(/\s+/g, ' ').trim();
// Basic language simplification
simplifiedContent = simplifiedContent.toLowerCase();
// // Simple summarization: taking the first few sentences
// const sentences = simplifiedContent.split('. ');
// const summarizedContent = sentences.slice(0, Math.min(5, sentences.length)).join('. ');
return simplifiedContent;
}
// Placeholder function to perform GPT analysis for media type and topics using Mistral-7b via OpenRouter
async function performGPTAnalysis(content) {
// Implement logic to send content to Mistral-7b via OpenRouter for GPT analysis
// Send content and receive GPT analysis response
// Placeholder code
const inferredMediaType = "article";
const extractedTopics = ["topic1", "topic2"];
return { inferredMediaType, extractedTopics };
}
// Placeholder function to map inferred values to predefined formats and topics
function mapInferredValues(mediaType, topics) {
// Implement logic to map inferred media type and topics to predefined formats and topics
// Match inferred values with predefined taxonomy
// Placeholder code
const predefinedMediaType = "Article";
const predefinedTopics = ["Topic 1", "Topic 2"];
return { predefinedMediaType, predefinedTopics };
}
// Placeholder function to format the response
function formatResponse(predefinedMediaType, predefinedTopics) {
// Implement logic to format the extracted metadata into the desired response structure
// Construct the response object
// Placeholder code
const response = {
format: predefinedMediaType,
topics: predefinedTopics,
};
return response;
}
export async function handler(event) {
try {
// Extract URL and API Key from the request body
const { url, apiKey } = JSON.parse(event.body);
// Validate if URL and API Key are present
if (!url || !apiKey) {
return {
statusCode: 400,
body: JSON.stringify({ error: 'URL and API Key are required' }),
};
}
// Step 1: Fetch content from the URL using a web scraping service
const fetchedContent = await fetchContentFromURL(url);
// Step 2: Simplify the fetched content for GPT analysis
const simplifiedContent = simplifyContent(fetchedContent);
// Step 3: Perform GPT analysis for media type and topics
const { inferredMediaType, extractedTopics } = await performGPTAnalysis(simplifiedContent);
// Step 4: Map inferred values to predefined formats and topics
const { predefinedMediaType, predefinedTopics } = mapInferredValues(inferredMediaType, extractedTopics);
// Step 5: Format the response
const formattedResponse = formatResponse(predefinedMediaType, predefinedTopics);
// Return the formatted response
return {
statusCode: 200,
body: JSON.stringify(simplifiedContent),
};
} catch (error) {
return {
statusCode: 500,
body: JSON.stringify({ error: 'Something went wrong' }),
};
}
}