From 209f2ecb832c5ad9767868f4e36db3dee1b5e76e Mon Sep 17 00:00:00 2001 From: Simon Aubury Date: Thu, 6 Apr 2023 12:25:46 +1000 Subject: [PATCH] Similpief demo for meetup --- docker-compose-demo.yml | 31 +++ notebooks/mastodon-analysis-demo.ipynb | 372 +++++++++++++++++++++++++ 2 files changed, 403 insertions(+) create mode 100644 docker-compose-demo.yml create mode 100644 notebooks/mastodon-analysis-demo.ipynb diff --git a/docker-compose-demo.yml b/docker-compose-demo.yml new file mode 100644 index 0000000..9ab3c37 --- /dev/null +++ b/docker-compose-demo.yml @@ -0,0 +1,31 @@ +--- +version: '3.9' +services: + minio: + hostname: minio + image: 'minio/minio:latest' + container_name: minio + ports: + - "9001:9001" + - "9000:9000" + command: [ "server", "/data", "--console-address", ":9001" ] + volumes: + - ./data:/data + environment: + MINIO_ROOT_USER: minio + MINIO_ROOT_PASSWORD: minio123 + MINIO_ACCESS_KEY: minio + MINIO_SECRET_KEY: minio123 + mc: + depends_on: + - minio + image: minio/mc + container_name: mc + entrypoint: > + /bin/sh -c " + until (/usr/bin/mc config host add minio http://minio:9000 minio minio123) do echo '...waiting...' && sleep 1; done; + /usr/bin/mc rm -r --force minio/mastodon; + /usr/bin/mc mb minio/mastodon; + /usr/bin/mc policy set public minio/mastodon; + exit 0; + " diff --git a/notebooks/mastodon-analysis-demo.ipynb b/notebooks/mastodon-analysis-demo.ipynb new file mode 100644 index 0000000..3a2f975 --- /dev/null +++ b/notebooks/mastodon-analysis-demo.ipynb @@ -0,0 +1,372 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mastodon analysis\n", + "Expected to be run in a notebook\n", + "\n", + "DuckDB’s Python client can be used [directly in Jupyter notebook](https://duckdb.org/docs/guides/python/jupyter)\n", + "\n", + "First step is import the relevant librariesSet and configure to directly output data to Pandas and to simplify the output that is printed to the notebook.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import duckdb\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "%load_ext sql\n", + "%sql duckdb:///:memory:\n", + "\n", + "%config SqlMagic.autopandas = True\n", + "%config SqlMagic.feedback = False\n", + "%config SqlMagic.displaycon = False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load [HTTPFS DuckDB extension](https://duckdb.org/docs/extensions/httpfs.html) for reading remote/writing remote files of object storage using the S3 API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "INSTALL httpfs;\n", + "LOAD httpfs;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Establish s3 endpoint\n", + "Set the s3 endpoint settings. Here we're using a local [MinIO](https://min.io/) as an Open Source, Amazon S3 compatible server" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "set s3_endpoint='localhost:9000';\n", + "set s3_access_key_id='minio';\n", + "set s3_secret_access_key='minio123';\n", + "set s3_use_ssl=false;\n", + "set s3_region='us-east-1';\n", + "set s3_url_style='path';" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And you can now query the parquet files directly from s3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "select *\n", + "from read_parquet('s3://mastodon/a*.parquet');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DuckDB SQL to process Mastodon activity\n", + "Run SQL\n", + "- cleanup any existing termporary tables\n", + "- create empty `language` lookup table and load languages from [language.csv](../duckdb/language.csv)\n", + "- create `mastodon_toot_raw` table by loading romote parquet files (from s3). Note the `created_at` timestamp is calculated as number of seconds from epoc\n", + "- final table `mastodon_toot` is a join of `mastodon_toot_raw` to `language`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "drop table if exists mastodon_toot_raw;\n", + "drop table if exists mastodon_toot;\n", + "drop table if exists language;\n", + "\n", + "CREATE TABLE language(lang_iso VARCHAR PRIMARY KEY, language_name VARCHAR);\n", + "\n", + "insert into language\n", + "select *\n", + "from read_csv('./language.csv', AUTO_DETECT=TRUE, header=True);\n", + "\n", + "create table mastodon_toot_raw as\n", + "select m_id\n", + ", created_at, ('EPOCH'::TIMESTAMP + INTERVAL (created_at::INT) seconds)::TIMESTAMPTZ as created_tz\n", + ", app\n", + ", url\n", + ", regexp_replace(regexp_replace(url, '^http[s]://', ''), '/.*$', '') as from_instance\n", + ", base_url\n", + ", language\n", + ", favourites\n", + ", username\n", + ", bot\n", + ", tags\n", + ", characters\n", + ", mastodon_text\n", + "from read_parquet('s3://mastodon/a*.parquet');\n", + "\n", + "create table mastodon_toot as\n", + "select mr.*, ln.language_name\n", + "from mastodon_toot_raw mr \n", + "left outer join language ln on (mr.language = ln.lang_iso);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Daily Mastodon usage\n", + "\n", + "We can query the `mastodon_toot` table directly to see the number of _toots_, _users_ each day by counting and grouping the activity by the day\n", + "\n", + "We can use the [mode](https://duckdb.org/docs/sql/aggregates.html#statistical-aggregates) aggregtae function to find the most frequent \"bot\" and \"not-bot\" users to find the most active Mastodon users\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "select strftime(created_tz, '%Y/%m/%d %a') as \"Created day\"\n", + ", count(*) as \"Num toots\"\n", + ", count(distinct(username)) as \"Num users\"\n", + ", count(distinct(from_instance)) as \"Num urls\"\n", + ", mode(case when bot='False' then username end) as \"Most freq non-bot\"\n", + ", mode(case when bot='True' then username end) as \"Most freq bot\"\n", + ", mode(base_url) as \"Most freq host\"\n", + "from mastodon_toot\n", + "group by 1\n", + "order by 1\n", + ";" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The Mastodon app landscape\n", + "What clients are used to access mastodon instances\n", + "\n", + "We take the query the `mastodon_toot` table, excluding \"bots\" and load query results into the `mastodon_app_df` Panda dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "mastodon_app_df << \n", + " select *\n", + " from mastodon_toot\n", + " where app is not null \n", + " and app <> ''\n", + " and bot='False';" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Seaborn](https://seaborn.pydata.org/) is a visualization library for statistical graphics in Python, built on the top of [matplotlib](https://matplotlib.org/). It also works really well with Panda data structures.\n", + "\n", + "\n", + "We can use [seaborn.countplot](https://seaborn.pydata.org/generated/seaborn.countplot.html) to show the counts of Mastodon app usage observations in each categorical bin using bars. Note, we are limiting this to the 10 highest occurances by specifying `mastodon_app_df.app.value_counts().iloc[:10]`\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.countplot(data=mastodon_app_df, y=\"app\", order=mastodon_app_df.app.value_counts().iloc[:10].index)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Time of day Mastodon usage\n", + "Let's see when Mastodon is used throughout the day and night. I want to get a raw hourly cound of _toots_ each hour of each day. We can load the results of this query into the `mastodon_usage_df` dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "mastodon_usage_df << \n", + " select strftime(created_tz, '%Y/%m/%d %a') as created_day\n", + " , date_part('hour', created_tz) as created_hour\n", + " , count(*) as num\n", + " from mastodon_toot\n", + " group by 1,2 \n", + " order by 1,2;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.lineplot(data=mastodon_usage_df, x=\"created_hour\", y=\"num\", hue=\"created_day\").set_xticks(range(24))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Language usage\n", + "A wildly inaccurate investigation of language tags" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "mastodon_usage_df << \n", + " select *\n", + " from mastodon_toot;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.countplot(data=mastodon_usage_df, y=\"language_name\", order=mastodon_usage_df.language_name.value_counts().iloc[:20].index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "mastodon_lang_df << \n", + " select *\n", + " from mastodon_toot\n", + " where characters < 200\n", + " and language not in ('unknown');" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sns.boxplot(data=mastodon_lang_df, x=\"characters\", y=\"language_name\", whis=100, orient=\"h\", order=mastodon_lang_df.language_name.value_counts().iloc[:20].index)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trending topics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Random stuff\n", + "\n", + "How frequently do _toots_ mention topical concepts such as the _superbowl_, _balloons_ or _ChatGPT_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "select strftime(created_tz, '%Y/%m/%d %a') as \"Created day\"\n", + ", count(*) as \"Num toots\"\n", + ", sum(case when mastodon_text ilike '%balloon%' then 1 else 0 end) as cnt_balloon\n", + ", sum(case when mastodon_text ilike '%earthquake%' then 1 else 0 end) as cnt_earthquake\n", + ", sum(case when mastodon_text ilike '%superbowl%' then 1 else 0 end) as cnt_superbowl\n", + ", sum(case when mastodon_text ilike '%chatgpt%' then 1 else 0 end) as cnt_chatgpt\n", + "from mastodon_toot\n", + "where created_tz between TIMESTAMP '2023-02-07 13:00:00' and TIMESTAMP '2023-02-18 12:59:59' \n", + "group by 1\n", + "order by 1\n", + ";" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.13 ('env': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "9031e1068205800656601d4369604809c22502cb495e0a99201febf8414f4b9f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}