////////////////////////////////////////////////////////////////
// reading
///////////////////////////////////////////////////////////////
// temp table w/ data skipping disabled
val readdf: dataframe =
spark.read.option(datasourcereadoptions.enable_data_skipping.key(), “false”).format(“hudi”).load(outputpath)
val rawsnapshottablename = safetablename(s”${tablename}_sql_snapshot”)
readdf.createorreplacetempview(rawsnapshottablename)
// temp table w/ data skipping enabled
val readdfskip: dataframe =
spark.read.option(datasourcereadoptions.enable_data_skipping.key(), “true”).format(“hudi”).load(outputpath)
val dataskippingsnapshottablename = safetablename(s”${tablename}_sql_snapshot_skipping”)
readdfskip.createorreplacetempview(dataskippingsnapshottablename)
// query 1: total votes by product_category, for 6 months
def runquery1(tablename: string) = {
// query 1: total votes by product_category, for 6 months
spark.sql(s”select sum(total_votes), product_category from $tablename where review_date > 2013-12-15 and review_date < 2014-06-01 group by product_category”).show()
}
// query 2: average star rating by product_id, for some product
def runquery2(tablename: string) = {
spark.sql(s”select avg(star_rating), product_id from $tablename where product_id in (b0184xc75u) group by product_id”).show()
}
// query 3: count number of reviews by customer_id for some 5 customers
def runquery3(tablename: string) = {
spark.sql(s”select count(*) as num_reviews, customer_id from $tablename where customer_id in (53096570,10046284,53096576,10000196,21700145) group by customer_id”).show()
}
//
// query 1: is a “wide” query and hence its expected to touch a lot of files
//
scala> runquery1(rawsnapshottablename)
+—————-+——————–+
|sum(total_votes)| product_category|
+—————-+——————–+
| 1050944| pc|
| 867794| kitchen|
| 1167489| home|
| 927531| wireless|
| 6861| video|
| 39602| digital_video_games|
| 954924|digital_video_dow…|
| 81876| luggage|
| 320536| video_games|
| 817679| sports|
| 11451| mobile_electronics|
| 228739| home_entertainment|
| 3769269|digital_ebook_pur…|
| 252273| baby|
| 735042| apparel|
| 49101| major_appliances|
| 484732| grocery|
| 285682| tools|
| 459980| electronics|
| 454258| outdoors|
+—————-+——————–+
only showing top 20 rows
scala> runquery1(dataskippingsnapshottablename)
+—————-+——————–+
|sum(total_votes)| product_category|
+—————-+——————–+
| 1050944| pc|
| 867794| kitchen|
| 1167489| home|
| 927531| wireless|
| 6861| video|
| 39602| digital_video_games|
| 954924|digital_video_dow…|
| 81876| luggage|
| 320536| video_games|
| 817679| sports|
| 11451| mobile_electronics|
| 228739| home_entertainment|
| 3769269|digital_ebook_pur…|
| 252273| baby|
| 735042| apparel|
| 49101| major_appliances|
| 484732| grocery|
| 285682| tools|
| 459980| electronics|
| 454258| outdoors|
+—————-+——————–+
only showing top 20 rows
//
// query 2: is a “pointwise” query and hence its expected that data-skipping should substantially reduce number
// of files scanned (as compared to baseline)
//
// note: that linear ordering (as compared to space-curve based on) will have similar effect on performance reducing
// total # of parquet files scanned, since were querying on the prefix of the ordering key
//
scala> runquery2(rawsnapshottablename)
+—————-+———-+
|avg(star_rating)|product_id|
+—————-+———-+
| 1.0|b0184xc75u|
+—————-+———-+
scala> runquery2(dataskippingsnapshottablename)
+—————-+———-+
|avg(star_rating)|product_id|
+—————-+———-+
| 1.0|b0184xc75u|
+—————-+———-+
//
// query 3: similar to q2, is a “pointwise” query, but querying other part of the ordering-key (product_id, customer_id)
// and hence its expected that data-skipping should substantially reduce number of files scanned (as compared to baseline, linear ordering).
//
// note: that linear ordering (as compared to space-curve based on) will _not_ have similar effect on performance reducing
// total # of parquet files scanned, since were not querying on the prefix of the ordering key
//
scala> runquery3(rawsnapshottablename)
+———–+———–+
|num_reviews|customer_id|
+———–+———–+
| 50| 53096570|
| 3| 53096576|
| 25| 10046284|
| 1| 10000196|
| 14| 21700145|
+———–+———–+
scala> runquery3(dataskippingsnapshottablename)
+———–+———–+
|num_reviews|customer_id|
+———–+———–+
| 50| 53096570|
| 3| 53096576|
| 25| 10046284|
| 1| 10000196|
| 14| 21700145|
+———–+———–+