kopia lustrzana https://github.com/saubury/mastodon-stream
Incremental dev
rodzic
688528a53f
commit
0cc14823c3
|
@ -0,0 +1,3 @@
|
|||
CONF_VER=7.1.0
|
||||
ELST_VER=7.11.0
|
||||
TZ_SET=Australia/Sydney
|
|
@ -7,3 +7,4 @@ config/mastodon-sink-s3-aws.json
|
|||
notebooks/demo.ipynb
|
||||
duckdb/init-s3.sql
|
||||
data_tmp/*
|
||||
docker-compose-orig.yml
|
||||
|
|
|
@ -25,6 +25,8 @@ python mastodonlisten.py --baseURL https://data-folks.masto.host/ --enableKafka
|
|||
|
||||
confluent-hub install confluentinc/kafka-connect-s3:10.3.0
|
||||
|
||||
curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3/config -d '@./config/mastodon-sink-s3-minio.json'
|
||||
|
||||
curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3/config -d '@./config/mastodon-sink-s3.json'
|
||||
|
||||
curl -X PUT -H "Content-Type:application/json" localhost:8083/connectors/mastodon-sink-s3-aws/config -d '@./config/mastodon-sink-s3-aws.json'
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
"name": "created_at",
|
||||
"type": ["null","int"],
|
||||
"logicalType": "date",
|
||||
"default" : "null"
|
||||
"default" : null
|
||||
},
|
||||
{
|
||||
"name": "created_at_str",
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
"name": "mastodon-sink-s3",
|
||||
"connector.class": "io.confluent.connect.s3.S3SinkConnector",
|
||||
"topics": "mastodon-topic",
|
||||
"format.class": "io.confluent.connect.s3.format.parquet.ParquetFormat",
|
||||
"flush.size": "10",
|
||||
"s3.bucket.name": "mastodon",
|
||||
"aws.access.key.id": "minio",
|
||||
"aws.secret.access.key": "minio123",
|
||||
"storage.class": "io.confluent.connect.s3.storage.S3Storage",
|
||||
"store.url": "http://minio:9000"
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
Plik binarny nie jest wyświetlany.
Przed Szerokość: | Wysokość: | Rozmiar: 21 KiB Po Szerokość: | Wysokość: | Rozmiar: 21 KiB |
|
@ -31,3 +31,92 @@ order by 1,2
|
|||
|
||||
-- select username, bot, count(*) from xx group by 1,2 order by 3 desc;
|
||||
|
||||
as select *
|
||||
|
||||
|
||||
|
||||
-- old backup
|
||||
-- Create toots from the single Parquet file one directory above the
-- working directory, keeping only the columns named below.
CREATE TABLE toots AS
SELECT
    m_id,
    created_at,
    created_at_str,
    app,
    url,
    base_url,
    language,
    favourites,
    username,
    bot,
    tags,
    characters,
    mastodon_text
FROM read_parquet('../xx.parquet');
|
||||
|
||||
-- Append the rows of every Parquet file in the working directory to toots.
-- The target columns are listed explicitly so the load is matched by name
-- and does not silently depend on the physical column order of toots.
INSERT INTO toots (
    m_id,
    created_at,
    created_at_str,
    app,
    url,
    base_url,
    language,
    favourites,
    username,
    bot,
    tags,
    characters,
    mastodon_text
)
SELECT
    m_id,
    created_at,
    created_at_str,
    app,
    url,
    base_url,
    language,
    favourites,
    username,
    bot,
    tags,
    characters,
    mastodon_text
FROM read_parquet('*.parquet');
|
||||
|
||||
-- Append the rows of every Parquet file under
-- 20230213/mastodon-topic/partition=0/ to toots.
-- The target columns are listed explicitly so the load is matched by name
-- and does not silently depend on the physical column order of toots.
INSERT INTO toots (
    m_id,
    created_at,
    created_at_str,
    app,
    url,
    base_url,
    language,
    favourites,
    username,
    bot,
    tags,
    characters,
    mastodon_text
)
SELECT
    m_id,
    created_at,
    created_at_str,
    app,
    url,
    base_url,
    language,
    favourites,
    username,
    bot,
    tags,
    characters,
    mastodon_text
FROM read_parquet('20230213/mastodon-topic/partition=0/*.parquet');
|
||||
|
||||
|
||||
|
||||
-- Build all_toots as the distinct rows of toots over the columns below.
-- created_at_str is not carried over, so rows that differ only in that
-- column collapse into one.
CREATE TABLE all_toots AS
SELECT DISTINCT
    m_id,
    created_at,
    app,
    url,
    base_url,
    language,
    favourites,
    username,
    bot,
    tags,
    characters,
    mastodon_text
FROM toots;
|
||||
|
||||
-- Write the contents of all_toots to a Parquet file in the working directory.
COPY all_toots
TO 'all_toots.parquet'
(FORMAT PARQUET);
|
|
@ -0,0 +1,6 @@
|
|||
# FROM confluentinc/cp-server-connect-base:7.3.1
|
||||
# Build on the confluentinc/cp-server-connect image, pinned to tag 7.1.0.
FROM confluentinc/cp-server-connect:7.1.0
|
||||
|
||||
# Install version 10.3.0 of the confluentinc/kafka-connect-s3 connector at
# image build time; --no-prompt keeps the installer non-interactive.
RUN confluent-hub install --no-prompt confluentinc/kafka-connect-s3:10.3.0
|
||||
|
||||
# ENTRYPOINT ["tail", "-f", "/dev/null"]
|
File diff suppressed because one or more lines are too long
Ładowanie…
Reference in New Issue