From 41951f363cf82f2240d1f3f013e46a43f6062b73 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Wed, 3 Aug 2022 04:45:17 +0800 Subject: [PATCH 01/15] add datafusion --- datafusion/README.md | 21 ++++++++++++++++++++ datafusion/benchmark.sh | 23 ++++++++++++++++++++++ datafusion/create.sql | 3 +++ datafusion/queries.sql | 43 +++++++++++++++++++++++++++++++++++++++++ datafusion/run.sh | 24 +++++++++++++++++++++++ datafusion/run2.sh | 24 +++++++++++++++++++++++ 6 files changed, 138 insertions(+) create mode 100644 datafusion/README.md create mode 100644 datafusion/benchmark.sh create mode 100644 datafusion/create.sql create mode 100644 datafusion/queries.sql create mode 100644 datafusion/run.sh create mode 100644 datafusion/run2.sh diff --git a/datafusion/README.md b/datafusion/README.md new file mode 100644 index 000000000..9e0fe2514 --- /dev/null +++ b/datafusion/README.md @@ -0,0 +1,21 @@ +# Datafusion + +DataFusion is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. For more information, please check + +We use parquet file here and create an external table for it; and then do the queries. + + + +### to generate benchmark results: + +```bash +bash benchmark.sh +``` + + + +### to generate full human readable results + +1. install datafusion-cli +2. download the parquet ```wget --continue https://datasets.clickhouse.com/hits_compatible/hits.parquet``` +3. 
execute it ```datafusion-cli -f create.sql queries.sql``` or ```bash run2.sh``` diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh new file mode 100644 index 000000000..fff1c1edc --- /dev/null +++ b/datafusion/benchmark.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Install Rust +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh +bash rust-init.sh -y +source .cargo/env + + +# Install Dependencies +sudo apt update -y +sudo apt install gcc -y + + +# Install Datafusion +cargo install --version 10.0.0 datafusion-cli + + +# Download benchmark target data +wget --continue https://datasets.clickhouse.com/hits_compatible/hits.parquet + + +# Run +bash run.sh diff --git a/datafusion/create.sql b/datafusion/create.sql new file mode 100644 index 000000000..eedd4c038 --- /dev/null +++ b/datafusion/create.sql @@ -0,0 +1,3 @@ +CREATE EXTERNAL TABLE hits +STORED AS PARQUET +LOCATION 'hits.parquet'; diff --git a/datafusion/queries.sql b/datafusion/queries.sql new file mode 100644 index 000000000..f205e3c1b --- /dev/null +++ b/datafusion/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0; +SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits; +SELECT AVG("UserID") FROM hits; +SELECT COUNT(DISTINCT "UserID") FROM hits; +SELECT COUNT(DISTINCT "SearchPhrase") FROM hits; +SELECT MIN("EventDate"), MAX("EventDate") FROM hits; +SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC; +SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10; +SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10; +SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10; +SELECT "MobilePhone", "MobilePhoneModel", 
COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10; +SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10; +SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; +SELECT "UserID", extract(minute FROM "EventTime") AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; +SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10; +SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits 
WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), 
SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits; +SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10; +SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10; +SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 
AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10; +SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', "EventTime") AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', "EventTime") ORDER BY DATE_TRUNC('minute', "EventTime") LIMIT 10 OFFSET 1000; diff --git 
a/datafusion/run.sh b/datafusion/run.sh new file mode 100644 index 000000000..f6ecb5964 --- /dev/null +++ b/datafusion/run.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +TRIES=3 +QUERY_NUM=1 +cat queries.sql | while read -r query; do + #sync + #echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + + echo $query > /tmp/query.sql + + echo -n "[" + for i in $(seq 1 $TRIES); do + RES=$(datafusion-cli -f create.sql /tmp/query.sql 2>&1 | tail -n 1) + [[ "$(echo $RES | awk '{print $5$6}')" == "Querytook" ]] && \ + echo -n "$(echo $RES | awk '{print $7}')" || \ + echo -n "null" + [[ "$i" != $TRIES ]] && echo -n ", " + + echo "${QUERY_NUM},${i},${RES}" >> result.csv + done + echo "]," + + QUERY_NUM=$((QUERY_NUM + 1)) +done diff --git a/datafusion/run2.sh b/datafusion/run2.sh new file mode 100644 index 000000000..0fe969b0d --- /dev/null +++ b/datafusion/run2.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +QUERY_NUM=1 +cat queries.sql | while read -r query; do + #sync + #echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + + echo $query > /tmp/query.sql + + echo + echo + echo ----------------------------------------- + echo + echo $QUERY_NUM. 
$query + echo + echo ----------------------------------------- + echo + echo + + + datafusion-cli -f create.sql /tmp/query.sql + + QUERY_NUM=$((QUERY_NUM + 1)) +done From 09cbf2d2319f43d55ca5a4f6442686a261884eb3 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Wed, 3 Aug 2022 16:19:09 +0800 Subject: [PATCH 02/15] fix run.sh --- datafusion/run.sh | 2 +- datafusion/run2.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/run.sh b/datafusion/run.sh index f6ecb5964..58931e5ac 100644 --- a/datafusion/run.sh +++ b/datafusion/run.sh @@ -6,7 +6,7 @@ cat queries.sql | while read -r query; do #sync #echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - echo $query > /tmp/query.sql + echo "$query" > /tmp/query.sql echo -n "[" for i in $(seq 1 $TRIES); do diff --git a/datafusion/run2.sh b/datafusion/run2.sh index 0fe969b0d..938b155bb 100644 --- a/datafusion/run2.sh +++ b/datafusion/run2.sh @@ -5,20 +5,20 @@ cat queries.sql | while read -r query; do #sync #echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null - echo $query > /tmp/query.sql + echo "$query" > /tmp/query.sql echo echo echo ----------------------------------------- echo - echo $QUERY_NUM. $query + echo $QUERY_NUM. 
"$query" echo echo ----------------------------------------- echo echo - datafusion-cli -f create.sql /tmp/query.sql + ./datafusion-cli -f create.sql /tmp/query.sql QUERY_NUM=$((QUERY_NUM + 1)) done From 3c8f8df761ebb8782274b0ab45f4d82633336850 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Wed, 3 Aug 2022 16:22:43 +0800 Subject: [PATCH 03/15] update run2.sh --- datafusion/run2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/run2.sh b/datafusion/run2.sh index 938b155bb..1d8221384 100644 --- a/datafusion/run2.sh +++ b/datafusion/run2.sh @@ -18,7 +18,7 @@ cat queries.sql | while read -r query; do echo - ./datafusion-cli -f create.sql /tmp/query.sql + datafusion-cli -f create.sql /tmp/query.sql QUERY_NUM=$((QUERY_NUM + 1)) done From 8078b23fc6870b9cd1845828b23c5fcaf8143e0b Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Wed, 3 Aug 2022 16:24:15 +0800 Subject: [PATCH 04/15] add /proc/sys/vm/drop_caches --- datafusion/run.sh | 4 ++-- datafusion/run2.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/run.sh b/datafusion/run.sh index 58931e5ac..fd01e8607 100644 --- a/datafusion/run.sh +++ b/datafusion/run.sh @@ -3,8 +3,8 @@ TRIES=3 QUERY_NUM=1 cat queries.sql | while read -r query; do - #sync - #echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null echo "$query" > /tmp/query.sql diff --git a/datafusion/run2.sh b/datafusion/run2.sh index 1d8221384..901badd57 100644 --- a/datafusion/run2.sh +++ b/datafusion/run2.sh @@ -2,8 +2,8 @@ QUERY_NUM=1 cat queries.sql | while read -r query; do - #sync - #echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null echo "$query" > /tmp/query.sql From 3e6876f28b9f43f135771418d597fb902f0f25bb Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Thu, 4 Aug 2022 16:35:06 +0800 Subject: [PATCH 05/15] fix benchmark.sh --- datafusion/benchmark.sh | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index fff1c1edc..2367e41bd 100644 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -3,7 +3,7 @@ # Install Rust curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh bash rust-init.sh -y -source .cargo/env +source ~/.cargo/env # Install Dependencies From 376af5f482750c0c9af7a011f7d310ad38f21eec Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Thu, 4 Aug 2022 17:13:38 +0800 Subject: [PATCH 06/15] result --- datafusion/results/f16s_v2.json | 58 +++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 datafusion/results/f16s_v2.json diff --git a/datafusion/results/f16s_v2.json b/datafusion/results/f16s_v2.json new file mode 100644 index 000000000..6c9e9bec8 --- /dev/null +++ b/datafusion/results/f16s_v2.json @@ -0,0 +1,58 @@ +{ + "system": "datafusion", + "date": "2022-07-01", + "machine": "f16s v2", + "cluster_size": 1, + "comment": "", + + "tags": ["rust", "column-oriented", "embedded", "stateless"], + + "load_time": 0, + "data_size": 14779976446, + + "result": [ +[1.221, 0.587, 0.565], +[0.489, 0.428, 0.424], +[1.213, 1.074, 1.071], +[0.948, 0.658, 0.643], +[2.770, 2.655, 2.635], +[null, null, null], +[0.474, 0.393, 0.393], +[0.493, 0.464, 0.468], +[3.659, 3.347, 3.354], +[6.146, 5.820, 5.619], +[2.237, 2.049, 1.910], +[2.823, 2.413, 2.348], +[null, null, null], +[null, null, null], +[null, null, null], +[4.696, 4.476, 4.423], +[null, null, null], +[null, null, null], +[null, null, null], +[0.770, 0.592, 0.570], +[12.369, 10.995, 10.819], +[16.187, 14.367, 14.240], +[39.799, 36.780, 36.316], +[122.764, 114.359, 114.303], +[6.006, 5.350, 5.159], +[7.816, 7.445, 7.450], +[9.817, 9.699, 9.472], +[18.702, 17.448, 17.686], +[268.486, 271.835, 274.347], +[3.631, 4.118, 4.110], +[8.540, 7.864, 7.848], +[10.325, 9.326, 9.221], +[0.047, 0.047, 0.047], +[null, null, null], +[null, null, null], +[5.091, 4.903, 
4.977], +[null, null, null], +[null, null, null], +[null, null, null], +[null, null, null], +[null, null, null], +[null, null, null], +[null, null, null] +] +} From f43f0e1637e85ddf3ada46067d4c293bf2c5090f Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Wed, 10 Aug 2022 18:57:49 +0800 Subject: [PATCH 07/15] Update datafusion/benchmark.sh Co-authored-by: Ilya Yatsishin <2159081+qoega@users.noreply.github.com> --- datafusion/benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index fff1c1edc..2367e41bd 100644 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -3,7 +3,7 @@ # Install Rust curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh bash rust-init.sh -y -source .cargo/env +source ~/.cargo/env # Install Dependencies From 75ba7958bdc356c513654d9373a561722827a009 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Wed, 10 Aug 2022 23:20:43 +0800 Subject: [PATCH 08/15] update benchmark to use master branch --- datafusion/benchmark.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/datafusion/benchmark.sh b/datafusion/benchmark.sh index 2367e41bd..32e66af21 100644 --- a/datafusion/benchmark.sh +++ b/datafusion/benchmark.sh @@ -12,7 +12,13 @@ sudo apt install gcc -y # Install Datafusion -cargo install --version 10.0.0 datafusion-cli +#cargo install --version 10.0.0 datafusion-cli + +# Install Datafusion Master Branch +git clone https://github.com/apache/arrow-datafusion.git +cd arrow-datafusion/datafusion-cli && cargo build --release +export PATH="`pwd`/target/release:$PATH" +cd ../.. 
# Download benchmark target data From 868dc775c801d23a41497aab7847965fc4975e97 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Wed, 10 Aug 2022 23:22:27 +0800 Subject: [PATCH 09/15] update create and queries --- datafusion/queries.sql | 24 ++++----- datafusion/results/f16s_v2.json | 86 ++++++++++++++++----------------- 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/datafusion/queries.sql b/datafusion/queries.sql index f205e3c1b..52e72e02e 100644 --- a/datafusion/queries.sql +++ b/datafusion/queries.sql @@ -4,7 +4,7 @@ SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits; SELECT AVG("UserID") FROM hits; SELECT COUNT(DISTINCT "UserID") FROM hits; SELECT COUNT(DISTINCT "SearchPhrase") FROM hits; -SELECT MIN("EventDate"), MAX("EventDate") FROM hits; +SELECT MIN("EventDate"::INT::DATE), MAX("EventDate"::INT::DATE) FROM hits; SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC; SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10; SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10; @@ -16,15 +16,15 @@ SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPh SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; -SELECT "UserID", extract(minute FROM "EventTime") AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; 
SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; -SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10; -SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10; +SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10; -SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY to_timestamp_seconds("EventTime"), "SearchPhrase" LIMIT 10; SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), 
SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 
81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits; @@ -34,10 +34,10 @@ SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWi SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10; SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10; SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10; -SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10; -SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10; -SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= 
'2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100; -SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', "EventTime") AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', "EventTime") ORDER BY DATE_TRUNC('minute', "EventTime") LIMIT 10 OFFSET 1000; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10; +SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY 
"TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "URLHash", "EventDate"::INT::DATE, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate"::INT::DATE ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-14' AND "EventDate"::INT::DATE <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; diff --git a/datafusion/results/f16s_v2.json b/datafusion/results/f16s_v2.json index 6c9e9bec8..4cbad7fa3 100644 --- a/datafusion/results/f16s_v2.json +++ b/datafusion/results/f16s_v2.json @@ -11,48 +11,48 @@ "data_size": 14779976446, "result": [ -[1.221, 0.587, 0.565], -[0.489, 0.428, 0.424], -[1.213, 1.074, 1.071], -[0.948, 0.658, 0.643], -[2.770, 2.655, 2.635], -[null, null, null], -[0.474, 0.393, 0.393], -[0.493, 0.464, 0.468], -[3.659, 3.347, 3.354], -[6.146, 5.820, 5.619], -[2.237, 2.049, 1.910], -[2.823, 2.413, 2.348], -[null, null, null], -[null, null, null], -[null, null, null], -[4.696, 4.476, 4.423], -[null, null, null], -[null, null, null], -[null, null, null], -[0.770, 0.592, 0.570], -[12.369, 10.995, 10.819], -[16.187, 14.367, 14.240], -[39.799, 36.780, 
36.316], -[122.764, 114.359, 114.303], -[6.006, 5.350, 5.159], -[7.816, 7.445, 7.450], -[9.817, 9.699, 9.472], -[18.702, 17.448, 17.686], -[268.486, 271.835, 274.347], -[3.631, 4.118, 4.110], -[8.540, 7.864, 7.848], -[10.325, 9.326, 9.221], -[0.047, 0.047, 0.047], -[null, null, null], -[null, null, null], -[5.091, 4.903, 4.977], -[null, null, null], -[null, null, null], -[null, null, null], -[null, null, null], -[null, null, null], -[null, null, null], -[null, null, null] +[1.178, 0.563, 0.519], +[0.473, 0.440, 0.406], +[1.102, 0.996, 0.995], +[0.924, 0.655, 0.658], +[2.518, 2.371, 2.440], +[4.933, 4.537, 4.475], +[0.521, 0.446, 0.478], +[0.454, 0.419, 0.446], +[3.345, 3.056, 3.035], +[5.525, 5.180, 5.137], +[1.932, 1.630, 1.620], +[2.374, 1.987, 1.994], +[5.235, 4.867, 4.909], +[7.099, 6.743, 6.863], +[5.873, 5.563, 5.499], +[4.297, 4.234, 4.072], +[10.163, 9.914, 9.764], +[7.985, 7.621, 7.568], +[17.371, 16.899, 16.937], +[0.740, 0.578, 0.567], +[11.948, 10.504, 10.532], +[15.448, 13.610, 13.618], +[38.965, 35.923, 35.557], +[112.691, 105.469, 105.010], +[5.747, 4.984, 5.024], +[7.265, 7.061, 6.955], +[9.385, 9.062, 9.098], +[11.937, 10.473, 10.354], +[252.987, 258.094, 259.536], +[2.270, 2.675, 2.674], +[8.146, 7.186, 7.266], +[9.498, 8.536, 8.536], +[0.048, 0.048, 0.048], +[21.958, 20.714, 20.281], +[22.049, 21.114, 20.961], +[4.599, 4.440, 4.381], +[0.360, 0.324, 0.326], +[0.299, 0.289, 0.294], +[0.276, 0.253, 0.255], +[0.654, 0.658, 0.635], +[0.130, 0.112, 0.112], +[0.124, 0.106, 0.109], +[0.107, 0.097, 0.096] ] } From dc60be1410f8a413fbeedfd5018c869190267913 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Wed, 10 Aug 2022 23:27:46 +0800 Subject: [PATCH 10/15] update readme --- datafusion/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/datafusion/README.md b/datafusion/README.md index 9e0fe2514..d2c3b870f 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -13,8 +13,14 @@ bash benchmark.sh ``` +### Know Issues: 
-### to generate full human readable results +1. importing parquet by `datafusion-cli` doesn't support specifying a schema, so some casting is needed in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) +2. importing parquet by `datafusion-cli` makes column names case-sensitive, so I changed all column names in queries.sql to double-quoted identifiers (e.g. `EventTime` -> `"EventTime"`) +3. `comparing binary with utf-8` and `group by binary` don't work on mac; if you run these queries on mac, you'll get some errors for queries that contain binary format + + +### to generate full human readable results (for debugging) 1. install datafusion-cli 2. download the parquet ```wget --continue https://datasets.clickhouse.com/hits_compatible/hits.parquet``` From 87d2857dad684480fd0778fb140c2f8916074f2b Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Wed, 10 Aug 2022 23:37:35 +0800 Subject: [PATCH 11/15] Update README.md --- datafusion/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/README.md b/datafusion/README.md index d2c3b870f..50e63ecb9 100644 --- a/datafusion/README.md +++ b/datafusion/README.md @@ -17,7 +17,7 @@ bash benchmark.sh 1. importing parquet by `datafusion-cli` doesn't support specifying a schema, so some casting is needed in queries.sql (e.g. converting EventTime from Int to Timestamp via `to_timestamp_seconds`) 2. importing parquet by `datafusion-cli` makes column names case-sensitive, so I changed all column names in queries.sql to double-quoted identifiers (e.g. `EventTime` -> `"EventTime"`) -3. `comparing binary with utf-8` and `group by binary` don't work on mac; if you run these queries on mac, you'll get some errors for queries that contain binary format +3. 
`comparing binary with utf-8` and `group by binary` don't work on mac; if you run these queries on mac, you'll get some errors for queries that contain binary format (see apache/arrow-datafusion#3050) ### to generate full human readable results (for debugging) From 765d0201ed912d7e2f861b9ae8ab5cebd0d2c0e5 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Wed, 10 Aug 2022 23:55:56 +0800 Subject: [PATCH 12/15] Update f16s_v2.json --- datafusion/results/f16s_v2.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/results/f16s_v2.json b/datafusion/results/f16s_v2.json index 4cbad7fa3..c54f638a0 100644 --- a/datafusion/results/f16s_v2.json +++ b/datafusion/results/f16s_v2.json @@ -1,6 +1,6 @@ { "system": "datafusion", - "date": "2022-07-01", + "date": "2022-08-10", "machine": "f16s v2", "cluster_size": 1, "comment": "", From 46e33d509eb04fd2a9a8388713d522fdc7b166ca Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Thu, 11 Aug 2022 01:01:46 +0800 Subject: [PATCH 13/15] update quries.sql --- datafusion/queries.sql | 52 +++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/datafusion/queries.sql b/datafusion/queries.sql index 52e72e02e..fbffa4d35 100644 --- a/datafusion/queries.sql +++ b/datafusion/queries.sql @@ -3,41 +3,41 @@ SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0; SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits; SELECT AVG("UserID") FROM hits; SELECT COUNT(DISTINCT "UserID") FROM hits; -SELECT COUNT(DISTINCT "SearchPhrase") FROM hits; +SELECT COUNT(DISTINCT "SearchPhrase"::TEXT) FROM hits; SELECT MIN("EventDate"::INT::DATE), MAX("EventDate"::INT::DATE) FROM hits; SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC; SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10; SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT 
"UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10; -SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10; -SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10; -SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; -SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10; -SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "MobilePhoneModel"::TEXT, COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel"::TEXT <> '' GROUP BY "MobilePhoneModel"::TEXT ORDER BY u DESC LIMIT 10; +SELECT "MobilePhone", "MobilePhoneModel"::TEXT, COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel"::TEXT <> '' GROUP BY "MobilePhone", "MobilePhoneModel"::TEXT ORDER BY u DESC LIMIT 10; +SELECT "SearchPhrase"::TEXT, COUNT(*) AS c FROM hits WHERE "SearchPhrase"::TEXT <> '' GROUP BY "SearchPhrase"::TEXT ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase"::TEXT, COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase"::TEXT <> '' GROUP BY "SearchPhrase"::TEXT ORDER BY u DESC LIMIT 10; +SELECT "SearchEngineID", "SearchPhrase"::TEXT, COUNT(*) AS c FROM hits WHERE "SearchPhrase"::TEXT <> '' GROUP BY "SearchEngineID", "SearchPhrase"::TEXT ORDER BY c DESC LIMIT 10; SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; -SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; -SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; -SELECT 
"UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase"::TEXT, COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase"::TEXT ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase"::TEXT, COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase"::TEXT LIMIT 10; +SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase"::TEXT, COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase"::TEXT ORDER BY COUNT(*) DESC LIMIT 10; SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; -SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; -SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; -SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; -SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; -SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; -SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10; -SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY to_timestamp_seconds("EventTime"), "SearchPhrase" LIMIT 10; -SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT COUNT(*) FROM hits WHERE 
"URL"::TEXT LIKE '%google%'; +SELECT "SearchPhrase"::TEXT, MIN("URL"::TEXT), COUNT(*) AS c FROM hits WHERE "URL"::TEXT LIKE '%google%' AND "SearchPhrase"::TEXT <> '' GROUP BY "SearchPhrase"::TEXT ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase"::TEXT, MIN("URL"::TEXT), MIN("Title"::TEXT), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title"::TEXT LIKE '%Google%' AND "URL"::TEXT NOT LIKE '%.google.%' AND "SearchPhrase"::TEXT <> '' GROUP BY "SearchPhrase"::TEXT ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE "URL"::TEXT LIKE '%google%' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; +SELECT "SearchPhrase"::TEXT FROM hits WHERE "SearchPhrase"::TEXT <> '' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; +SELECT "SearchPhrase"::TEXT FROM hits WHERE "SearchPhrase"::TEXT <> '' ORDER BY "SearchPhrase"::TEXT LIMIT 10; +SELECT "SearchPhrase"::TEXT FROM hits WHERE "SearchPhrase"::TEXT <> '' ORDER BY to_timestamp_seconds("EventTime"), "SearchPhrase"::TEXT LIMIT 10; +SELECT "CounterID", AVG(length("URL"::TEXT)) AS l, COUNT(*) AS c FROM hits WHERE "URL"::TEXT <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE("Referer"::TEXT, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer"::TEXT)) AS l, COUNT(*) AS c, MIN("Referer"::TEXT) FROM hits WHERE "Referer"::TEXT <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), 
SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 
88), SUM("ResolutionWidth" + 89) FROM hits; -SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; -SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase"::TEXT <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase"::TEXT <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; -SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10; -SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10; +SELECT "URL"::TEXT, COUNT(*) AS c FROM hits GROUP BY "URL"::TEXT ORDER BY c DESC LIMIT 10; +SELECT 1, "URL"::TEXT, COUNT(*) AS c FROM hits GROUP BY 1, "URL"::TEXT ORDER BY c DESC LIMIT 10; SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10; -SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10; -SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY 
"Title" ORDER BY PageViews DESC LIMIT 10; -SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "URL"::TEXT, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL"::TEXT <> '' GROUP BY "URL"::TEXT ORDER BY PageViews DESC LIMIT 10; +SELECT "Title"::TEXT, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title"::TEXT <> '' GROUP BY "Title"::TEXT ORDER BY PageViews DESC LIMIT 10; +SELECT "URL"::TEXT, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL"::TEXT ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer"::TEXT ELSE '' END AS Src, "URL"::TEXT AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", 
"AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; SELECT "URLHash", "EventDate"::INT::DATE, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate"::INT::DATE ORDER BY PageViews DESC LIMIT 10 OFFSET 100; SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-14' AND "EventDate"::INT::DATE <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; From d9aded9124bc81eaa8d8a11c56f4f8d162ff8a91 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Thu, 11 Aug 2022 01:58:32 +0800 Subject: [PATCH 14/15] update queries.sql --- datafusion/queries.sql | 52 +++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/datafusion/queries.sql b/datafusion/queries.sql index fbffa4d35..52e72e02e 100644 --- a/datafusion/queries.sql +++ b/datafusion/queries.sql @@ -3,41 +3,41 @@ SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0; SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits; SELECT AVG("UserID") FROM hits; SELECT COUNT(DISTINCT "UserID") FROM hits; -SELECT COUNT(DISTINCT "SearchPhrase"::TEXT) FROM hits; +SELECT COUNT(DISTINCT "SearchPhrase") FROM hits; SELECT MIN("EventDate"::INT::DATE), 
MAX("EventDate"::INT::DATE) FROM hits; SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC; SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10; SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10; -SELECT "MobilePhoneModel"::TEXT, COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel"::TEXT <> '' GROUP BY "MobilePhoneModel"::TEXT ORDER BY u DESC LIMIT 10; -SELECT "MobilePhone", "MobilePhoneModel"::TEXT, COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel"::TEXT <> '' GROUP BY "MobilePhone", "MobilePhoneModel"::TEXT ORDER BY u DESC LIMIT 10; -SELECT "SearchPhrase"::TEXT, COUNT(*) AS c FROM hits WHERE "SearchPhrase"::TEXT <> '' GROUP BY "SearchPhrase"::TEXT ORDER BY c DESC LIMIT 10; -SELECT "SearchPhrase"::TEXT, COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase"::TEXT <> '' GROUP BY "SearchPhrase"::TEXT ORDER BY u DESC LIMIT 10; -SELECT "SearchEngineID", "SearchPhrase"::TEXT, COUNT(*) AS c FROM hits WHERE "SearchPhrase"::TEXT <> '' GROUP BY "SearchEngineID", "SearchPhrase"::TEXT ORDER BY c DESC LIMIT 10; +SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10; +SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10; +SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10; +SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", 
"SearchPhrase" ORDER BY c DESC LIMIT 10; SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10; -SELECT "UserID", "SearchPhrase"::TEXT, COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase"::TEXT ORDER BY COUNT(*) DESC LIMIT 10; -SELECT "UserID", "SearchPhrase"::TEXT, COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase"::TEXT LIMIT 10; -SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase"::TEXT, COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase"::TEXT ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; +SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10; +SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449; -SELECT COUNT(*) FROM hits WHERE "URL"::TEXT LIKE '%google%'; -SELECT "SearchPhrase"::TEXT, MIN("URL"::TEXT), COUNT(*) AS c FROM hits WHERE "URL"::TEXT LIKE '%google%' AND "SearchPhrase"::TEXT <> '' GROUP BY "SearchPhrase"::TEXT ORDER BY c DESC LIMIT 10; -SELECT "SearchPhrase"::TEXT, MIN("URL"::TEXT), MIN("Title"::TEXT), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title"::TEXT LIKE '%Google%' AND "URL"::TEXT NOT LIKE '%.google.%' AND "SearchPhrase"::TEXT <> '' GROUP BY "SearchPhrase"::TEXT ORDER BY c DESC LIMIT 10; -SELECT * FROM hits WHERE "URL"::TEXT LIKE '%google%' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; -SELECT "SearchPhrase"::TEXT FROM hits WHERE "SearchPhrase"::TEXT <> '' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; -SELECT "SearchPhrase"::TEXT FROM hits WHERE "SearchPhrase"::TEXT <> '' ORDER BY "SearchPhrase"::TEXT LIMIT 10; -SELECT "SearchPhrase"::TEXT FROM hits WHERE "SearchPhrase"::TEXT <> '' ORDER BY 
to_timestamp_seconds("EventTime"), "SearchPhrase"::TEXT LIMIT 10; -SELECT "CounterID", AVG(length("URL"::TEXT)) AS l, COUNT(*) AS c FROM hits WHERE "URL"::TEXT <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT REGEXP_REPLACE("Referer"::TEXT, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer"::TEXT)) AS l, COUNT(*) AS c, MIN("Referer"::TEXT) FROM hits WHERE "Referer"::TEXT <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%'; +SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10; +SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY to_timestamp_seconds("EventTime"), "SearchPhrase" LIMIT 10; +SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), 
SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), 
SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits; -SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase"::TEXT <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; -SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase"::TEXT <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10; +SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10; -SELECT "URL"::TEXT, COUNT(*) AS c FROM hits GROUP BY "URL"::TEXT ORDER BY c DESC LIMIT 10; -SELECT 1, "URL"::TEXT, COUNT(*) AS c FROM hits GROUP BY 1, "URL"::TEXT ORDER BY c DESC LIMIT 10; +SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10; +SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10; SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10; -SELECT "URL"::TEXT, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND 
"EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL"::TEXT <> '' GROUP BY "URL"::TEXT ORDER BY PageViews DESC LIMIT 10; -SELECT "Title"::TEXT, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title"::TEXT <> '' GROUP BY "Title"::TEXT ORDER BY PageViews DESC LIMIT 10; -SELECT "URL"::TEXT, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL"::TEXT ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer"::TEXT ELSE '' END AS Src, "URL"::TEXT AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10; +SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10; +SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; 
+SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; SELECT "URLHash", "EventDate"::INT::DATE, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate"::INT::DATE ORDER BY PageViews DESC LIMIT 10 OFFSET 100; SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-14' AND "EventDate"::INT::DATE <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; From bcbb94a904fae8702f3632d5642ebe8863ec4414 Mon Sep 17 00:00:00 2001 From: Wei-Ting Kuo Date: Thu, 11 Aug 2022 02:52:51 +0800 Subject: [PATCH 15/15] update result --- datafusion/results/f16s_v2.json | 86 ++++++++++++++++----------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/datafusion/results/f16s_v2.json b/datafusion/results/f16s_v2.json index c54f638a0..a0f9ebd1c 100644 --- a/datafusion/results/f16s_v2.json +++ 
b/datafusion/results/f16s_v2.json @@ -11,48 +11,48 @@ "data_size": 14779976446, "result": [ -[1.178, 0.563, 0.519], -[0.473, 0.440, 0.406], -[1.102, 0.996, 0.995], -[0.924, 0.655, 0.658], -[2.518, 2.371, 2.440], -[4.933, 4.537, 4.475], -[0.521, 0.446, 0.478], -[0.454, 0.419, 0.446], -[3.345, 3.056, 3.035], -[5.525, 5.180, 5.137], -[1.932, 1.630, 1.620], -[2.374, 1.987, 1.994], -[5.235, 4.867, 4.909], -[7.099, 6.743, 6.863], -[5.873, 5.563, 5.499], -[4.297, 4.234, 4.072], -[10.163, 9.914, 9.764], -[7.985, 7.621, 7.568], -[17.371, 16.899, 16.937], -[0.740, 0.578, 0.567], -[11.948, 10.504, 10.532], -[15.448, 13.610, 13.618], -[38.965, 35.923, 35.557], -[112.691, 105.469, 105.010], -[5.747, 4.984, 5.024], -[7.265, 7.061, 6.955], -[9.385, 9.062, 9.098], -[11.937, 10.473, 10.354], -[252.987, 258.094, 259.536], -[2.270, 2.675, 2.674], -[8.146, 7.186, 7.266], -[9.498, 8.536, 8.536], -[0.048, 0.048, 0.048], -[21.958, 20.714, 20.281], -[22.049, 21.114, 20.961], -[4.599, 4.440, 4.381], -[0.360, 0.324, 0.326], -[0.299, 0.289, 0.294], -[0.276, 0.253, 0.255], -[0.654, 0.658, 0.635], -[0.130, 0.112, 0.112], -[0.124, 0.106, 0.109], -[0.107, 0.097, 0.096] +[0.992, 0.567, 0.542], +[0.468, 0.412, 0.424], +[1.074, 0.996, 0.983], +[0.951, 0.650, 0.636], +[2.506, 2.397, 2.430], +[4.972, 4.557, 4.570], +[0.481, 0.508, 0.451], +[0.459, 0.431, 0.442], +[3.324, 3.073, 3.085], +[5.578, 5.204, 5.212], +[1.904, 1.635, 1.629], +[2.438, 2.042, 2.122], +[5.122, 4.989, 5.045], +[7.142, 6.860, 6.853], +[5.931, 5.540, 5.645], +[4.288, 4.154, 4.186], +[10.123, 9.914, 9.853], +[8.098, 7.674, 7.705], +[17.694, 16.997, 16.956], +[0.734, 0.575, 0.584], +[11.867, 10.749, 10.736], +[15.467, 13.738, 13.599], +[38.945, 35.948, 35.709], +[114.322, 107.217, 107.070], +[5.727, 5.125, 5.055], +[7.367, 7.111, 7.042], +[9.463, 9.048, 9.134], +[12.088, 10.382, 10.508], +[252.422, 254.793, 257.960], +[2.262, 2.666, 2.655], +[7.979, 7.210, 7.222], +[9.422, 8.669, 8.609], +[0.050, 0.049, 0.048], +[21.977, 20.811, 
20.672], +[22.026, 20.822, 20.878], +[4.580, 4.430, 4.469], +[0.373, 0.330, 0.344], +[0.311, 0.299, 0.280], +[0.285, 0.259, 0.257], +[0.691, 0.634, 0.624], +[0.131, 0.116, 0.112], +[0.122, 0.106, 0.108], +[0.105, 0.095, 0.099] ] }