From 6980eb7f9fe315e86936651937fd95a756c5bba1 Mon Sep 17 00:00:00 2001 From: Rohit Date: Fri, 17 May 2024 23:10:18 +0530 Subject: [PATCH] Added Movie Recommended System using ML Model --- contrib/machine-learning/DataSets.txt | 1 + .../Movie Recommended System.ipynb | 952 ++++++++++++++++++ contrib/machine-learning/index.md | 148 ++- 3 files changed, 1099 insertions(+), 2 deletions(-) create mode 100644 contrib/machine-learning/DataSets.txt create mode 100644 contrib/machine-learning/Movie Recommended System.ipynb diff --git a/contrib/machine-learning/DataSets.txt b/contrib/machine-learning/DataSets.txt new file mode 100644 index 0000000..0e1116e --- /dev/null +++ b/contrib/machine-learning/DataSets.txt @@ -0,0 +1 @@ +Data sets that are used in making this Movie Recommended System -: https://drive.google.com/drive/folders/1cHQFeg3lRb_5c8qIy2xiL-HlWqlKNsuf?usp=drive_link diff --git a/contrib/machine-learning/Movie Recommended System.ipynb b/contrib/machine-learning/Movie Recommended System.ipynb new file mode 100644 index 0000000..4cfbbaa --- /dev/null +++ b/contrib/machine-learning/Movie Recommended System.ipynb @@ -0,0 +1,952 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "6beaa6b9", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "movies = pd.read_csv(\"movies.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ac5fe1eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movieIdtitlegenres
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
12Jumanji (1995)Adventure|Children|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama|Romance
45Father of the Bride Part II (1995)Comedy
\n", + "
" + ], + "text/plain": [ + " movieId title \\\n", + "0 1 Toy Story (1995) \n", + "1 2 Jumanji (1995) \n", + "2 3 Grumpier Old Men (1995) \n", + "3 4 Waiting to Exhale (1995) \n", + "4 5 Father of the Bride Part II (1995) \n", + "\n", + " genres \n", + "0 Adventure|Animation|Children|Comedy|Fantasy \n", + "1 Adventure|Children|Fantasy \n", + "2 Comedy|Romance \n", + "3 Comedy|Drama|Romance \n", + "4 Comedy " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "movies.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b6b1ec00", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "def clean_title(title):\n", + " title = re.sub(\"[^a-zA-Z0-9 ]\", \"\", title)\n", + " return title" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0fd4cf17", + "metadata": {}, + "outputs": [], + "source": [ + "movies[\"clean_title\"] = movies[\"title\"].apply(clean_title)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e69c38af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movieIdtitlegenresclean_title
01Toy Story (1995)Adventure|Animation|Children|Comedy|FantasyToy Story 1995
12Jumanji (1995)Adventure|Children|FantasyJumanji 1995
23Grumpier Old Men (1995)Comedy|RomanceGrumpier Old Men 1995
34Waiting to Exhale (1995)Comedy|Drama|RomanceWaiting to Exhale 1995
45Father of the Bride Part II (1995)ComedyFather of the Bride Part II 1995
...............
62418209157We (2018)DramaWe 2018
62419209159Window of the Soul (2001)DocumentaryWindow of the Soul 2001
62420209163Bad Poems (2018)Comedy|DramaBad Poems 2018
62421209169A Girl Thing (2001)(no genres listed)A Girl Thing 2001
62422209171Women of Devil's Island (1962)Action|Adventure|DramaWomen of Devils Island 1962
\n", + "

62423 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " movieId title \\\n", + "0 1 Toy Story (1995) \n", + "1 2 Jumanji (1995) \n", + "2 3 Grumpier Old Men (1995) \n", + "3 4 Waiting to Exhale (1995) \n", + "4 5 Father of the Bride Part II (1995) \n", + "... ... ... \n", + "62418 209157 We (2018) \n", + "62419 209159 Window of the Soul (2001) \n", + "62420 209163 Bad Poems (2018) \n", + "62421 209169 A Girl Thing (2001) \n", + "62422 209171 Women of Devil's Island (1962) \n", + "\n", + " genres \\\n", + "0 Adventure|Animation|Children|Comedy|Fantasy \n", + "1 Adventure|Children|Fantasy \n", + "2 Comedy|Romance \n", + "3 Comedy|Drama|Romance \n", + "4 Comedy \n", + "... ... \n", + "62418 Drama \n", + "62419 Documentary \n", + "62420 Comedy|Drama \n", + "62421 (no genres listed) \n", + "62422 Action|Adventure|Drama \n", + "\n", + " clean_title \n", + "0 Toy Story 1995 \n", + "1 Jumanji 1995 \n", + "2 Grumpier Old Men 1995 \n", + "3 Waiting to Exhale 1995 \n", + "4 Father of the Bride Part II 1995 \n", + "... ... \n", + "62418 We 2018 \n", + "62419 Window of the Soul 2001 \n", + "62420 Bad Poems 2018 \n", + "62421 A Girl Thing 2001 \n", + "62422 Women of Devils Island 1962 \n", + "\n", + "[62423 rows x 4 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "movies" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0b2a4752", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "vectorizer = TfidfVectorizer(ngram_range=(1,2))\n", + "\n", + "tfidf = vectorizer.fit_transform(movies[\"clean_title\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "948d38ed", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics.pairwise import cosine_similarity\n", + "import numpy as np\n", + "\n", + "def search(title):\n", + " title = clean_title(title)\n", + " query_vec = vectorizer.transform([title])\n", + " similarity = 
cosine_similarity(query_vec, tfidf).flatten()\n", + " indices = np.argpartition(similarity, -5)[-5:]\n", + " results = movies.iloc[indices].iloc[::-1]\n", + " \n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "20445f70", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2ae4cdc198c74ae18922a2bc9fdcd1b8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Text(value='Toy Story', description='Movie Title:')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "da25b587a1c84ea487fab47ba876eeb5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import ipywidgets as widgets\n", + "from IPython.display import display\n", + "\n", + "movie_input = widgets.Text(\n", + " value='Enter Here: ',\n", + " description='Movie Title:',\n", + " disabled=False\n", + ")\n", + "movie_list = widgets.Output()\n", + "\n", + "def on_type(data):\n", + " with movie_list:\n", + " movie_list.clear_output()\n", + " title = data[\"new\"]\n", + " if len(title) > 5:\n", + " display(search(title))\n", + "\n", + "movie_input.observe(on_type, names='value')\n", + "\n", + "\n", + "display(movie_input, movie_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2901e869", + "metadata": {}, + "outputs": [], + "source": [ + "movie_id = 89745\n", + "\n", + "#def find_similar_movies(movie_id):\n", + "movie = movies[movies[\"movieId\"] == movie_id]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7b8d1efa", + "metadata": {}, + "outputs": [], + "source": [ + "ratings = pd.read_csv(\"ratings.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "50526290", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ 
+ "userId int64\n", + "movieId int64\n", + "rating float64\n", + "timestamp int64\n", + "dtype: object" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ratings.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b27d0ba0", + "metadata": {}, + "outputs": [], + "source": [ + "similar_users = ratings[(ratings[\"movieId\"] == movie_id) & (ratings[\"rating\"] > 4)][\"userId\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7c13c44b", + "metadata": {}, + "outputs": [], + "source": [ + "similar_user_recs = ratings[(ratings[\"userId\"].isin(similar_users)) & (ratings[\"rating\"] > 4)][\"movieId\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "165057e3", + "metadata": {}, + "outputs": [], + "source": [ + "similar_user_recs = similar_user_recs.value_counts() / len(similar_users)\n", + "\n", + "similar_user_recs = similar_user_recs[similar_user_recs > .10]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b603f47e", + "metadata": {}, + "outputs": [], + "source": [ + "all_users = ratings[(ratings[\"movieId\"].isin(similar_user_recs.index)) & (ratings[\"rating\"] > 4)]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1765ebef", + "metadata": {}, + "outputs": [], + "source": [ + "all_user_recs = all_users[\"movieId\"].value_counts() / len(all_users[\"userId\"].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "65b35c3d", + "metadata": {}, + "outputs": [], + "source": [ + "rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)\n", + "rec_percentages.columns = [\"similar\", \"all\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "3dcca5cd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
similarall
897451.0000000.040459
585590.5733930.148256
593150.5306490.054931
791320.5197150.132987
25710.4966870.247010
.........
476100.1035450.022770
7800.1033800.054723
887440.1030480.010383
12580.1012260.083887
11930.1008950.120244
\n", + "

193 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " similar all\n", + "89745 1.000000 0.040459\n", + "58559 0.573393 0.148256\n", + "59315 0.530649 0.054931\n", + "79132 0.519715 0.132987\n", + "2571 0.496687 0.247010\n", + "... ... ...\n", + "47610 0.103545 0.022770\n", + "780 0.103380 0.054723\n", + "88744 0.103048 0.010383\n", + "1258 0.101226 0.083887\n", + "1193 0.100895 0.120244\n", + "\n", + "[193 rows x 2 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rec_percentages" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b84c2747", + "metadata": {}, + "outputs": [], + "source": [ + "rec_percentages[\"score\"] = rec_percentages[\"similar\"] / rec_percentages[\"all\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "f2debdc7", + "metadata": {}, + "outputs": [], + "source": [ + "rec_percentages = rec_percentages.sort_values(\"score\", ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "ac76b826", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
similarallscoremovieIdtitlegenresclean_title
170671.0000000.04045924.71636889745Avengers, The (2012)Action|Adventure|Sci-Fi|IMAXAvengers The 2012
205130.1037110.00528919.610199106072Thor: The Dark World (2013)Action|Adventure|Fantasy|IMAXThor The Dark World 2013
250580.2410540.01236719.491770122892Avengers: Age of Ultron (2015)Action|Adventure|Sci-FiAvengers Age of Ultron 2015
196780.2165340.01211917.867419102125Iron Man 3 (2013)Action|Sci-Fi|Thriller|IMAXIron Man 3 2013
167250.2150430.01205217.84307488140Captain America: The First Avenger (2011)Action|Adventure|Sci-Fi|Thriller|WarCaptain America The First Avenger 2011
163120.1754470.01014217.29982486332Thor (2011)Action|Adventure|Drama|Fantasy|IMAXThor 2011
213480.2876080.01673717.183667110102Captain America: The Winter Soldier (2014)Action|Adventure|Sci-Fi|IMAXCaptain America The Winter Soldier 2014
250710.2140490.01285616.649399122920Captain America: Civil War (2016)Action|Sci-Fi|ThrillerCaptain America Civil War 2016
250610.1360170.00857315.865628122900Ant-Man (2015)Action|Adventure|Sci-FiAntMan 2015
146280.2428760.01551715.65192177561Iron Man 2 (2010)Action|Adventure|Sci-Fi|Thriller|IMAXIron Man 2 2010
\n", + "
" + ], + "text/plain": [ + " similar all score movieId \\\n", + "17067 1.000000 0.040459 24.716368 89745 \n", + "20513 0.103711 0.005289 19.610199 106072 \n", + "25058 0.241054 0.012367 19.491770 122892 \n", + "19678 0.216534 0.012119 17.867419 102125 \n", + "16725 0.215043 0.012052 17.843074 88140 \n", + "16312 0.175447 0.010142 17.299824 86332 \n", + "21348 0.287608 0.016737 17.183667 110102 \n", + "25071 0.214049 0.012856 16.649399 122920 \n", + "25061 0.136017 0.008573 15.865628 122900 \n", + "14628 0.242876 0.015517 15.651921 77561 \n", + "\n", + " title \\\n", + "17067 Avengers, The (2012) \n", + "20513 Thor: The Dark World (2013) \n", + "25058 Avengers: Age of Ultron (2015) \n", + "19678 Iron Man 3 (2013) \n", + "16725 Captain America: The First Avenger (2011) \n", + "16312 Thor (2011) \n", + "21348 Captain America: The Winter Soldier (2014) \n", + "25071 Captain America: Civil War (2016) \n", + "25061 Ant-Man (2015) \n", + "14628 Iron Man 2 (2010) \n", + "\n", + " genres \\\n", + "17067 Action|Adventure|Sci-Fi|IMAX \n", + "20513 Action|Adventure|Fantasy|IMAX \n", + "25058 Action|Adventure|Sci-Fi \n", + "19678 Action|Sci-Fi|Thriller|IMAX \n", + "16725 Action|Adventure|Sci-Fi|Thriller|War \n", + "16312 Action|Adventure|Drama|Fantasy|IMAX \n", + "21348 Action|Adventure|Sci-Fi|IMAX \n", + "25071 Action|Sci-Fi|Thriller \n", + "25061 Action|Adventure|Sci-Fi \n", + "14628 Action|Adventure|Sci-Fi|Thriller|IMAX \n", + "\n", + " clean_title \n", + "17067 Avengers The 2012 \n", + "20513 Thor The Dark World 2013 \n", + "25058 Avengers Age of Ultron 2015 \n", + "19678 Iron Man 3 2013 \n", + "16725 Captain America The First Avenger 2011 \n", + "16312 Thor 2011 \n", + "21348 Captain America The Winter Soldier 2014 \n", + "25071 Captain America Civil War 2016 \n", + "25061 AntMan 2015 \n", + "14628 Iron Man 2 2010 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rec_percentages.head(10).merge(movies, 
left_index=True, right_on=\"movieId\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "479b52da", + "metadata": {}, + "outputs": [], + "source": [ + "def find_similar_movies(movie_id):\n", + " similar_users = ratings[(ratings[\"movieId\"] == movie_id) & (ratings[\"rating\"] > 4)][\"userId\"].unique()\n", + " similar_user_recs = ratings[(ratings[\"userId\"].isin(similar_users)) & (ratings[\"rating\"] > 4)][\"movieId\"]\n", + " similar_user_recs = similar_user_recs.value_counts() / len(similar_users)\n", + "\n", + " similar_user_recs = similar_user_recs[similar_user_recs > .10]\n", + " all_users = ratings[(ratings[\"movieId\"].isin(similar_user_recs.index)) & (ratings[\"rating\"] > 4)]\n", + " all_user_recs = all_users[\"movieId\"].value_counts() / len(all_users[\"userId\"].unique())\n", + " rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)\n", + " rec_percentages.columns = [\"similar\", \"all\"]\n", + " \n", + " rec_percentages[\"score\"] = rec_percentages[\"similar\"] / rec_percentages[\"all\"]\n", + " rec_percentages = rec_percentages.sort_values(\"score\", ascending=False)\n", + " return rec_percentages.head(10).merge(movies, left_index=True, right_on=\"movieId\")[[\"score\", \"title\", \"genres\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "c54c6d1c", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "86f82703a3894fa099d6908444af30d6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Text(value='Enter Here: ', description='Movie Title:')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "402eff788a954f6e9a9f667d48798db0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import ipywidgets as widgets\n", + 
"from IPython.display import display\n", + "\n", + "movie_name_input = widgets.Text(\n", + " value='Enter Here: ',\n", + " description='Movie Title:',\n", + " disabled=False\n", + ")\n", + "recommendation_list = widgets.Output()\n", + "\n", + "def on_type(data):\n", + " with recommendation_list:\n", + " recommendation_list.clear_output()\n", + " title = data[\"new\"]\n", + " if len(title) > 5:\n", + " results = search(title)\n", + " movie_id = results.iloc[0][\"movieId\"]\n", + " display(find_similar_movies(movie_id))\n", + "\n", + "movie_name_input.observe(on_type, names='value')\n", + "\n", + "display(movie_name_input, recommendation_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f3e01c4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/contrib/machine-learning/index.md b/contrib/machine-learning/index.md index 82596a2..a04a9ec 100644 --- a/contrib/machine-learning/index.md +++ b/contrib/machine-learning/index.md @@ -1,3 +1,147 @@ -# List of sections +# Movie Recommended System Using ML in Python -- [Section title](filename.md) +To provide a structured breakdown of the concepts used in writing a movie recommendation system, we'll assume a typical example where the system involves data processing, model training, and prediction steps. Here is a comprehensive division of the concepts, assuming a basic content outline for a movie recommendation system: + +## 1. 
Data Collection and Preprocessing +### Importing Libraries +- **Purpose:** Use libraries to handle data manipulation, model building, and evaluation. +- **Example:** + ```python + import pandas as pd + import numpy as np + from sklearn.model_selection import train_test_split + from sklearn.metrics import mean_squared_error + ``` + +### Loading Data +- **Purpose:** Load datasets containing movie ratings, movie details, and user information. +- **Example:** + ```python + ratings = pd.read_csv('ratings.csv') + movies = pd.read_csv('movies.csv') + ``` + +### Data Cleaning +- **Purpose:** Handle missing values, duplicates, and inconsistent data. +- **Example:** + ```python + ratings.dropna(inplace=True) + ``` + +### Data Merging +- **Purpose:** Combine multiple datasets for comprehensive analysis. +- **Example:** + ```python + data = pd.merge(ratings, movies, on='movieId') + ``` + +### Exploratory Data Analysis (EDA) +- **Purpose:** Gain insights into data through visualizations and statistical analysis. +- **Example:** + ```python + import matplotlib.pyplot as plt + data['rating'].hist() + ``` + +## 2. Feature Engineering +### Encoding Categorical Variables +- **Purpose:** Convert non-numeric data into a numeric format suitable for model input. +- **Example:** + ```python + from sklearn.preprocessing import LabelEncoder + le = LabelEncoder() + data['movieId'] = le.fit_transform(data['movieId']) + data['userId'] = le.fit_transform(data['userId']) + ``` + +### Creating User-Item Matrix +- **Purpose:** Create a matrix where rows represent users and columns represent movies, with ratings as values. +- **Example:** + ```python + user_item_matrix = data.pivot(index='userId', columns='movieId', values='rating') + ``` + +## 3. Model Building +### Choosing a Model +- **Common Models:** Collaborative Filtering (User-Based, Item-Based), Matrix Factorization (SVD). 
+- **Example:** + ```python + from sklearn.decomposition import TruncatedSVD + ``` + +### Model Training +- **Purpose:** Train the chosen model on the dataset. +- **Example:** + ```python + svd = TruncatedSVD(n_components=50) + matrix = user_item_matrix.fillna(0) + svd.fit(matrix) + ``` + +## 4. Making Predictions +### Generating Recommendations +- **Purpose:** Use the trained model to predict ratings and recommend movies. +- **Example:** + ```python + user_ratings = svd.transform(matrix) + predicted_ratings = svd.inverse_transform(user_ratings) + ``` + +### Selecting Top Recommendations +- **Purpose:** Identify and rank the top movies for each user. +- **Example:** + ```python + def recommend_movies(user_id, num_recommendations): + user_index = user_id - 1 # assuming user_id starts from 1 + sorted_indices = np.argsort(predicted_ratings[user_index])[::-1] + top_movies = sorted_indices[:num_recommendations] + return top_movies + ``` + +## 5. Model Evaluation +### Splitting Data +- **Purpose:** Split the data into training and testing sets to evaluate model performance. +- **Example:** + ```python + train_data, test_data = train_test_split(data, test_size=0.2) + ``` + +### Evaluation Metrics +- **Common Metrics:** RMSE (Root Mean Squared Error), MAE (Mean Absolute Error). +- **Example:** + ```python + def calculate_rmse(true_ratings, predicted_ratings): + return np.sqrt(mean_squared_error(true_ratings, predicted_ratings)) + ``` + +## 6. Deployment +### Saving the Model +- **Purpose:** Save the trained model for future use. +- **Example:** + ```python + import joblib + joblib.dump(svd, 'movie_recommendation_model.pkl') + ``` + +### Loading the Model +- **Purpose:** Load the saved model to make predictions. +- **Example:** + ```python + model = joblib.load('movie_recommendation_model.pkl') + ``` + +### Creating an Interface +- **Purpose:** Build a user interface to interact with the recommendation system (e.g., web app). 
+- **Example:** Using Flask for a web application. Form fields arrive as strings, so `user_id` is converted to an integer before `recommend_movies` uses it as an index. + ```python + from flask import Flask, request, render_template + app = Flask(__name__) + + @app.route('/recommend', methods=['POST']) + def recommend(): + user_id = int(request.form['user_id']) + recommendations = recommend_movies(user_id, 10) + return render_template('recommendations.html', movies=recommendations) + ``` + +This breakdown provides a comprehensive guide to the various concepts and steps involved in building a movie recommendation system, from data preprocessing to deployment.