Jonathan Seltmann 2024-06-29 12:03:32 +02:00
commit e43f985d1a
1 changed files with 32 additions and 30 deletions

View File

@ -9,23 +9,23 @@ Durch das Importieren der Daten kann der Startup bis zu 30 Minuten dauern.
## Cassandra ## Cassandra
Commands can be run in Cassandra Web at `localhost:3000` or in the Docker container with: Die Befehle können in Cassandra Web unter der Adresse `localhost:3000` oder im Docker-Container ausgeführt werden mit:
sudo docker exec -it cass1 cqlsh sudo docker exec -it cass1 cqlsh
Note: the `MATERIALIZED VIEWS` and `CUSTOM INDEX` are already created in the [startup script](https://github.com/Miracle-Fruit/kikeriki/blob/main/cassandra/startup/setup/setup_db.sh) and do not need to be run again. Hinweis: Die `MATERIALIZED VIEWS` und `CUSTOM INDEX` sind bereits im [Startup-Skript](./startup/setup/setup_db.sh) erstellt und müssen nicht erneut ausgeführt werden.
## Queries ## Queries
All queries can also be found [here](https://github.com/Miracle-Fruit/kikeriki/tree/main/cassandra/startup/queries). Alle Queries finden Sie auch [hier](./startup/queries).
1. Listing all the posts made by an account. 1. Auflisten der Posts, die von einem Account gemacht wurden, bzw. ihm zugeordnet wurden
SELECT content FROM twitter.tweets where author_id = 233248636; SELECT content FROM twitter.tweets where author_id = 233248636;
![result_ex1](img/ex1.png) ![result_ex1](img/ex1.png)
2. Find the 100 accounts with the most followers 2. Finden der 100 Accounts mit den meisten Followern
CREATE MATERIALIZED VIEW twitter.most_follows AS CREATE MATERIALIZED VIEW twitter.most_follows AS
SELECT user_id, follower_len from twitter.user_stats SELECT user_id, follower_len from twitter.user_stats
@ -38,7 +38,7 @@ All queries can also be found [here](https://github.com/Miracle-Fruit/kikeriki/t
Top 100 most followed accounts: [817268, 173732041, 14506809, 158804228, 358775055, 45416789, 302282272, 65913144, 261001122, 14246001, 7702232, 101204352, 280365428, 15439395, 26929220, 46537966, 32774989, 2367911, 7429892, 88097807, 274153775, 41147062, 127973392, 10350, 12127832, 18666844, 279787626, 14511951, 116036694, 225784456, 270449528, 63796828, 22784458, 364917755, 15693493, 17346342, 28933226, 7872262, 14180231, 100581193, 88323281, 131029775, 25952851, 14692604, 21681252, 309366491, 25376226, 22705686, 188108667, 18715024, 3829151, 41172837, 92319025, 24081780, 15102849, 83943787, 15680204, 7846, 184097849, 7377812, 204317520, 24641194, 14691709, 112939321, 7081402, 153226312, 14269220, 94414805, 108811740, 1065921, 221829166, 116952434, 15222083, 6608332, 17759701, 16098603, 7860742, 20471349, 6519522, 19040580, 14536491, 197504076, 12611642, 36198161, 3840, 22841103, 440963134, 20273398, 17092592, 13348, 17093617, 22679419, 208132323, 18776017, 15846407, 18581803, 5442012, 813286, 3359851, 59804598] Top 100 most followed accounts: [817268, 173732041, 14506809, 158804228, 358775055, 45416789, 302282272, 65913144, 261001122, 14246001, 7702232, 101204352, 280365428, 15439395, 26929220, 46537966, 32774989, 2367911, 7429892, 88097807, 274153775, 41147062, 127973392, 10350, 12127832, 18666844, 279787626, 14511951, 116036694, 225784456, 270449528, 63796828, 22784458, 364917755, 15693493, 17346342, 28933226, 7872262, 14180231, 100581193, 88323281, 131029775, 25952851, 14692604, 21681252, 309366491, 25376226, 22705686, 188108667, 18715024, 3829151, 41172837, 92319025, 24081780, 15102849, 83943787, 15680204, 7846, 184097849, 7377812, 204317520, 24641194, 14691709, 112939321, 7081402, 153226312, 14269220, 94414805, 108811740, 1065921, 221829166, 116952434, 15222083, 6608332, 17759701, 16098603, 7860742, 20471349, 6519522, 19040580, 14536491, 197504076, 12611642, 36198161, 3840, 22841103, 440963134, 20273398, 17092592, 13348, 17093617, 22679419, 208132323, 
18776017, 15846407, 18581803, 5442012, 813286, 3359851, 59804598]
``` ```
3. Finding the 100 accounts that follow the most of the accounts found in 2). 3. Finden der 100 Accounts, die den meisten der Accounts folgen, die in 2. gefunden wurden
```python ```python
followed_accs = session.execute(f'SELECT follower_id FROM twitter.follower_relation WHERE user_id IN {tuple(most_follows)};') followed_accs = session.execute(f'SELECT follower_id FROM twitter.follower_relation WHERE user_id IN {tuple(most_follows)};')
_list = list() _list = list()
@ -51,18 +51,18 @@ print("100 accounts that follow the most of the accounts found in 2)",followed_t
100 accounts that follow the most of the accounts found in 2) [(3359851, 47), (7860742, 47), (15913, 45), (7861312, 45), (16098603, 43), (18776017, 41), (10350, 39), (48485771, 39), (22679419, 38), (3443591, 37), (18581803, 37), (18927441, 37), (26281970, 37), (7872262, 36), (24742040, 36), (5442012, 36), (10671602, 35), (11928542, 35), (14922225, 35), (16453996, 35), (21681252, 35), (14589257, 34), (15853668, 34), (9451052, 34), (65913144, 34), (14269220, 34), (15234657, 34), (17092592, 34), (59804598, 34), (87764480, 34), (40981798, 34), (16475194, 34), (16464746, 33), (93905958, 33), (20152005, 33), (42361118, 33), (36629388, 33), (18666844, 32), (19413393, 32), (12127832, 32), (14180231, 32), (14983833, 32), (25026165, 32), (18742444, 32), (29758446, 31), (20880546, 31), (116952434, 31), (30207757, 31), (21195122, 31), (43003845, 31), (22784458, 30), (19040580, 30), (14691709, 30), (24004172, 30), (1065921, 30), (26280712, 30), (43170475, 30), (22462180, 30), (7846, 29), (1183041, 29), (3040621, 29), (7377812, 29), (34428380, 29), (20273398, 28), (14230524, 28), (15222083, 28), (24641194, 28), (83943787, 28), (36198161, 28), (14536491, 28), (813286, 27), (12611642, 27), (29514951, 27), (31353077, 27), (18996905, 27), (14471778, 27), (15838599, 27), (16112634, 27), (17526132, 27), (17759701, 27), (31331740, 27), (25952851, 27), (84043660, 27), (9431932, 27), (20397258, 27), (127973392, 27), (16674726, 27), (116036694, 27), (972651, 26), (13687132, 26), (4068821, 26), (7702232, 26), (6080022, 26), (17224642, 26), (20935355, 26), (24081780, 26), (26033920, 26), (28933226, 26), (43933017, 26), (101633415, 26)] 100 accounts that follow the most of the accounts found in 2) [(3359851, 47), (7860742, 47), (15913, 45), (7861312, 45), (16098603, 43), (18776017, 41), (10350, 39), (48485771, 39), (22679419, 38), (3443591, 37), (18581803, 37), (18927441, 37), (26281970, 37), (7872262, 36), (24742040, 36), (5442012, 36), (10671602, 35), (11928542, 35), (14922225, 35), 
(16453996, 35), (21681252, 35), (14589257, 34), (15853668, 34), (9451052, 34), (65913144, 34), (14269220, 34), (15234657, 34), (17092592, 34), (59804598, 34), (87764480, 34), (40981798, 34), (16475194, 34), (16464746, 33), (93905958, 33), (20152005, 33), (42361118, 33), (36629388, 33), (18666844, 32), (19413393, 32), (12127832, 32), (14180231, 32), (14983833, 32), (25026165, 32), (18742444, 32), (29758446, 31), (20880546, 31), (116952434, 31), (30207757, 31), (21195122, 31), (43003845, 31), (22784458, 30), (19040580, 30), (14691709, 30), (24004172, 30), (1065921, 30), (26280712, 30), (43170475, 30), (22462180, 30), (7846, 29), (1183041, 29), (3040621, 29), (7377812, 29), (34428380, 29), (20273398, 28), (14230524, 28), (15222083, 28), (24641194, 28), (83943787, 28), (36198161, 28), (14536491, 28), (813286, 27), (12611642, 27), (29514951, 27), (31353077, 27), (18996905, 27), (14471778, 27), (15838599, 27), (16112634, 27), (17526132, 27), (17759701, 27), (31331740, 27), (25952851, 27), (84043660, 27), (9431932, 27), (20397258, 27), (127973392, 27), (16674726, 27), (116036694, 27), (972651, 26), (13687132, 26), (4068821, 26), (7702232, 26), (6080022, 26), (17224642, 26), (20935355, 26), (24081780, 26), (26033920, 26), (28933226, 26), (43933017, 26), (101633415, 26)]
4. Listing the information for the personal home page of any account (best try with the accounts found in 2); the start page should contain the following (implement as separate queries): 4. Auflisten der Informationen für die persönliche Startseite eines beliebigen Accounts (am besten mit den in 2. gefundenen Accounts ausprobieren); die Startseite soll Folgendes beinhalten (als getrennte Queries umsetzen):
* the number of followers && the number of followed accounts * 4.1 und 4.2 die Anzahl der Follower && die Anzahl der verfolgten Accounts
SELECT follower_len, follows_len FROM twitter.user_stats WHERE user_id = 233248636; //cheack user_id can be changed ```SELECT follower_len, follows_len FROM twitter.user_stats WHERE user_id = 233248636; //check user_id can be changed```
![result_ex4_1](img/ex4_1.png) ![result_ex4_1](img/ex4_1.png)
* either the 25 newest or the 25 most popular posts of the followed accounts (via DB query) * 4.3 wahlweise die 25 neusten (zeitbasiert) oder die 25 beliebtesten Posts (like-basiert) der verfolgten Accounts (per DB-Abfrage)
25 newest 25 neuste
CREATE MATERIALIZED VIEW twitter.start_view_new AS CREATE MATERIALIZED VIEW twitter.start_view_new AS
SELECT user_id_x,follower_id,number_of_likes,number_of_shares,date_time,name,author,content,id FROM twitter.user SELECT user_id_x,follower_id,number_of_likes,number_of_shares,date_time,name,author,content,id FROM twitter.user
@ -74,8 +74,10 @@ print("100 accounts that follow the most of the accounts found in 2)",followed_t
![result_ex4_2](img/ex4_2_date.png) ![result_ex4_2](img/ex4_2_date.png)
25 most popular 25 beliebteste
// To our knowledge, ordering by number_of_likes to get the 25 most popular posts is not possible with the current data schema
CREATE MATERIALIZED VIEW twitter.start_view_like AS CREATE MATERIALIZED VIEW twitter.start_view_like AS
SELECT user_id_x,follower_id,number_of_likes,number_of_shares,date_time,author,name,content,id FROM twitter.user SELECT user_id_x,follower_id,number_of_likes,number_of_shares,date_time,author,name,content,id FROM twitter.user
WHERE user_id_x IS NOT NULL AND follower_id IS NOT NULL AND number_of_likes IS NOT NULL AND id IS NOT NULL WHERE user_id_x IS NOT NULL AND follower_id IS NOT NULL AND number_of_likes IS NOT NULL AND id IS NOT NULL
@ -85,12 +87,13 @@ print("100 accounts that follow the most of the accounts found in 2)",followed_t
![result_ex4_2](img/ex4_2_likes.png) ![result_ex4_2](img/ex4_2_likes.png)
5. Caching of the posts for the home page (cf. 4) requires a so-called fan-out in the cache of each follower when writing a new post * 4.4 Caching der Posts für die Startseite (vgl. 4), erfordert einen sog. Fan-Out in den Cache jedes Followers beim Schreiben eines neuen Posts
CREATE MATERIALIZED VIEW twitter.start_view_user1 AS CREATE MATERIALIZED VIEW twitter.start_view_user1 AS
SELECT follower_id,number_of_likes,date_time,author,name,content,id FROM twitter.user SELECT follower_id,number_of_likes,date_time,author,name,content,id FROM twitter.user
WHERE user_id_x IS NOT NULL AND user_id_x=172883064 AND follower_id IS NOT NULL AND number_of_likes IS NOT NULL AND id IS NOT NULL WHERE user_id_x IS NOT NULL AND user_id_x=172883064 AND follower_id IS NOT NULL AND number_of_likes IS NOT NULL AND id IS NOT NULL
PRIMARY KEY ((user_id_x),number_of_likes,follower_id,id); PRIMARY KEY ((user_id_x),number_of_likes,follower_id,id);
// # ORDER BY needs the partition key in WHERE // # ORDER BY needs the partition key in WHERE
SELECT * from twitter.start_view_taylor WHERE user_id_x=172883064 ORDER BY number_of_likes DESC LIMIT 25; SELECT * from twitter.start_view_taylor WHERE user_id_x=172883064 ORDER BY number_of_likes DESC LIMIT 25;
// INSERT new tweet // INSERT new tweet
@ -106,9 +109,8 @@ print("100 accounts that follow the most of the accounts found in 2)",followed_t
![result_ex4_2](img/ex4_2_likes.png) ![result_ex4_2](img/ex4_2_likes.png)
6. List of the 25 most popular posts that contain a given word (if possible also with AND linking several words) * 4.5 Auflisten der 25 beliebtesten Posts, die ein geg. Wort enthalten (falls möglich auch mit UND-Verknüpfung mehrerer Worte)
// To our knowledge, ordering by number_of_likes to get the 25 most popular posts is not possible with the current data schema
CREATE CUSTOM INDEX search_in ON twitter.tweets (content) USING 'org.apache.cassandra.index.sasi.SASIIndex' CREATE CUSTOM INDEX search_in ON twitter.tweets (content) USING 'org.apache.cassandra.index.sasi.SASIIndex'
WITH OPTIONS = { 'mode': 'CONTAINS', 'analyzer_class': 'org.apache.cassandra.index.sasi.analyzer.NonTokenizingAnalyzer', WITH OPTIONS = { 'mode': 'CONTAINS', 'analyzer_class': 'org.apache.cassandra.index.sasi.analyzer.NonTokenizingAnalyzer',
'case_sensitive': 'false' }; 'case_sensitive': 'false' };