diff --git a/README.md b/README.md index 67c2419..31ece5c 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,8 @@ kafka-avro-console-consumer --bootstrap-server localhost:9092 --topic mastodon-t curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3/config -d '@./config/mastodon-sink-s3-minio.json' ``` +# Open s3 browser +http://localhost:9001/ # Kafka Connect OLD diff --git a/docker-compose.yml b/docker-compose.yml index 5e739c2..c02289f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -178,23 +178,23 @@ services: exit 0; " - jupyter: - image: jupyter/scipy-notebook - ports: - - "8888:8888" - healthcheck: - test: nc -z localhost 8888 || exit -1 - start_period: 15s - interval: 5s - timeout: 10s - retries: 10 - volumes: - - ./notebooks:/home/jovyan/ - user: root - environment: - JUPYTER_ENABLE_LAB: "yes" - JUPYTER_RUNTIME_DIR: "/tmp" - NB_USER: simonaubury - CHOWN_HOME: 'yes' - CHOWN_HOME_OPTS: '-R' - command: "start-notebook.sh --allow-root --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password=''" + # jupyter: + # image: jupyter/scipy-notebook + # ports: + # - "8888:8888" + # healthcheck: + # test: nc -z localhost 8888 || exit -1 + # start_period: 15s + # interval: 5s + # timeout: 10s + # retries: 10 + # volumes: + # - ./notebooks:/home/jovyan/ + # user: root + # environment: + # JUPYTER_ENABLE_LAB: "yes" + # JUPYTER_RUNTIME_DIR: "/tmp" + # NB_USER: simonaubury + # CHOWN_HOME: 'yes' + # CHOWN_HOME_OPTS: '-R' + # command: "start-notebook.sh --allow-root --ip=0.0.0.0 --NotebookApp.token='' --NotebookApp.password=''" diff --git a/notebooks/mastodon-analysis.ipynb b/notebooks/mastodon-analysis.ipynb index 73dc507..7305230 100644 --- a/notebooks/mastodon-analysis.ipynb +++ b/notebooks/mastodon-analysis.ipynb @@ -49,6 +49,14 @@ "LOAD httpfs;" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Establish s3 endpoint\n", + "Set the s3 endpoint settings. Here we're using a local [MinIO](https://min.io/) as an Open Source, Amazon S3 compatible server" + ] + }, { "cell_type": "code", "execution_count": null, @@ -64,6 +72,13 @@ "set s3_url_style='path';" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And you can now query the parquet files directly from s3" + ] + }, { "cell_type": "code", "execution_count": null, @@ -118,11 +133,12 @@ ", tags\n", ", characters\n", ", mastodon_text\n", - "FROM read_parquet('../data_tmp/all_toots.parquet');\n", + "from read_parquet('s3://mastodon/topics/mastodon-topic/partition=0/*');\n", "\n", "create table mastodon_toot as\n", "select mr.*, ln.language_name\n", - "from mastodon_toot_raw mr left outer join language ln on (mr.language = ln.lang_iso);" + "from mastodon_toot_raw mr \n", + "left outer join language ln on (mr.language = ln.lang_iso);" ] }, { @@ -152,7 +168,6 @@ ", mode(case when bot='True' then username end) as \"Most freq bot\"\n", ", mode(base_url) as \"Most freq host\"\n", "from mastodon_toot\n", - "where created_tz between TIMESTAMP '2023-02-07 13:00:00' and TIMESTAMP '2023-02-18 12:59:59' \n", "group by 1\n", "order by 1\n", ";" @@ -235,9 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "# sns.lineplot(data=mastodon_usage_df, x=\"created_hour\", y=\"num\")\n", - "df = mastodon_usage_df[mastodon_usage_df['created_day'].isin(['2023/02/04 Sat', '2023/02/05 Sun', '2023/02/11 Sat', '2023/02/12 Sun', '2023/02/15 Wed', '2023/02/16 Thu', '2023/02/17 Fri'] ) ]\n", - "sns.lineplot(data=df , x=\"created_hour\", y=\"num\", hue=\"created_day\").set_xticks(range(24))" + "sns.lineplot(data=mastodon_usage_df, x=\"created_hour\", y=\"num\", hue=\"created_day\").set_xticks(range(24))" ] }, { @@ -277,10 +290,10 @@ "source": [ "%%sql\n", "mastodon_lang_df << \n", - "select *\n", - "from mastodon_toot\n", - "where characters < 200\n", - "and language not in ('unknown');" + " select *\n", + " from mastodon_toot\n", + " where characters < 200\n", + " and language not in ('unknown');" ] }, { diff --git a/requirements.txt b/requirements.txt index 61bab14..4bada0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,108 @@ -Mastodon.py -BeautifulSoup4 -confluent_kafka -avro +anyio==3.6.2 +appnope==0.1.3 +argon2-cffi==21.3.0 +argon2-cffi-bindings==21.2.0 +arrow==1.2.3 +asttokens==2.2.1 +attrs==22.2.0 +avro==1.11.1 +backcall==0.2.0 +beautifulsoup4==4.11.1 +bleach==6.0.0 +blurhash==1.1.4 +certifi==2022.12.7 +cffi==1.15.1 +chardet==5.1.0 +charset-normalizer==3.0.1 +comm==0.1.2 +confluent-kafka==2.0.2 +contourpy==1.0.7 +cycler==0.11.0 +debugpy==1.6.6 +decorator==5.1.1 +defusedxml==0.7.1 +duckdb==0.6.1 +duckdb-engine==0.6.8 +executing==1.2.0 +fastavro==1.7.1 +fastjsonschema==2.16.2 +fonttools==4.38.0 +fqdn==1.5.1 +idna==3.4 +importlib-metadata==6.0.0 +ipykernel==6.21.1 +ipython==8.9.0 +ipython-genutils==0.2.0 +ipython-sql==0.4.1 +isoduration==20.11.0 +jedi==0.18.2 +Jinja2==3.1.2 +jsonpointer==2.3 +jsonschema==4.17.3 +jupyter-events==0.6.3 +jupyter_client==8.0.2 +jupyter_core==5.2.0 +jupyter_server==2.2.1 +jupyter_server_terminals==0.4.4 +jupyterlab-pygments==0.2.2 +kiwisolver==1.4.4 +MarkupSafe==2.1.2 +Mastodon.py==1.8.0 +matplotlib==3.6.3 +matplotlib-inline==0.1.6 +mistune==2.0.4 +nbclassic==0.5.1 +nbclient==0.7.2 +nbconvert==7.2.9 +nbformat==5.7.3 +nest-asyncio==1.5.6 +notebook==6.5.2 +notebook_shim==0.2.2 +numpy==1.24.2 +packaging==23.0 +pandas==1.5.3 +pandocfilters==1.5.0 +parso==0.8.3 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==9.4.0 +platformdirs==2.6.2 +prettytable==0.7.2 +prometheus-client==0.16.0 +prompt-toolkit==3.0.36 +psutil==5.9.4 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pycparser==2.21 +Pygments==2.14.0 +pyparsing==3.0.9 +pyrsistent==0.19.3 +python-dateutil==2.8.2 +python-json-logger==2.0.4 +python-magic==0.4.27 +pytz==2022.7.1 +PyYAML==6.0 +pyzmq==25.0.0 +requests==2.28.2 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +seaborn==0.12.2 +Send2Trash==1.8.0 +six==1.16.0 +sniffio==1.3.0 +soupsieve==2.3.2.post1 +SQLAlchemy==1.4.46 +sqlparse==0.4.3 +stack-data==0.6.2 +terminado==0.17.1 +timer==0.2.2 +tinycss2==1.2.1 +tornado==6.2 +traitlets==5.9.0 +uri-template==1.2.0 +urllib3==1.26.14 +wcwidth==0.2.6 +webcolors==1.12 +webencodings==0.5.1 +websocket-client==1.5.1 +zipp==3.12.1 diff --git a/xx.parquet b/xx.parquet deleted file mode 100644 index efebce6..0000000 Binary files a/xx.parquet and /dev/null differ