From a06766ff71568e1e94bd59d128a876dc90b74d12 Mon Sep 17 00:00:00 2001 From: mwlon Date: Sat, 8 Jul 2023 22:11:55 -0400 Subject: [PATCH 01/18] clap instead of structopt --- Cargo.lock | 251 +++++++++++++++++++++++++++++--------------- README.md | 8 +- bench/Cargo.toml | 2 +- bench/src/main.rs | 18 ++-- pco_cli/Cargo.toml | 2 +- pco_cli/src/main.rs | 6 +- pco_cli/src/opt.rs | 54 +++++----- 7 files changed, 213 insertions(+), 128 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74b94762..e59d07cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,12 +40,52 @@ dependencies = [ ] [[package]] -name = "ansi_term" -version = "0.12.1" +name = "anstream" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" dependencies = [ - "winapi", + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is-terminal", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" + +[[package]] +name = "anstyle-parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +dependencies = [ + "windows-sys 0.48.0", +] + +[[package]] +name = "anstyle-wincon" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +dependencies = [ + "anstyle", + "windows-sys 0.48.0", ] 
[[package]] @@ -242,17 +282,6 @@ dependencies = [ "regex-syntax 0.7.3", ] -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.1.0" @@ -269,9 +298,9 @@ checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" name = "bench" version = "0.0.0" dependencies = [ + "clap", "pco", "q_compress", - "structopt", "tabled", "zstd", ] @@ -282,6 +311,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" + [[package]] name = "bstr" version = "0.2.17" @@ -355,19 +390,51 @@ dependencies = [ [[package]] name = "clap" -version = "2.34.0" +version = "4.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +checksum = "1640e5cc7fb47dbb8338fd471b105e7ed6c3cb2aeb00c2e067127ffd3764a05d" dependencies = [ - "ansi_term", - "atty", - "bitflags", + "clap_builder", + "clap_derive", + "once_cell", +] + +[[package]] +name = "clap_builder" +version = "4.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98c59138d527eeaf9b53f35a77fcc1fad9d883116070c63d5de1c7dc7b00c72b" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", "strsim", - "textwrap", - "unicode-width", - "vec_map", ] +[[package]] +name = "clap_derive" +version = "4.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8cd2b2a819ad6eec39e8f1d6b53001af1e5469f8c177579cdaeb313115b825f" +dependencies = [ + "heck", + 
"proc-macro2", + "quote", + "syn 2.0.23", +] + +[[package]] +name = "clap_lex" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + [[package]] name = "const-random" version = "0.1.15" @@ -444,13 +511,34 @@ dependencies = [ "syn 2.0.23", ] +[[package]] +name = "errno" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys 0.48.0", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "flatbuffers" version = "23.5.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" dependencies = [ - "bitflags", + "bitflags 1.3.2", "rustc_version", ] @@ -577,15 +665,6 @@ version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" -dependencies = [ - "unicode-segmentation", -] - [[package]] name = "heck" version = "0.4.1" @@ -601,6 +680,12 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" + [[package]] name = "iana-time-zone" version = "0.1.57" @@ -630,6 +715,17 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" +[[package]] +name = "is-terminal" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +dependencies = [ + "hermit-abi 0.3.2", + "rustix", + "windows-sys 0.48.0", +] + [[package]] name = "itoa" version = "0.4.8" @@ -736,6 +832,12 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" +[[package]] +name = "linux-raw-sys" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" + [[package]] name = "lock_api" version = "0.4.7" @@ -876,7 +978,7 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", ] @@ -926,7 +1028,7 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-sys", + "windows-sys 0.36.1", ] [[package]] @@ -978,11 +1080,11 @@ version = "0.0.0-alpha.0" dependencies = [ "anyhow", "arrow", + "clap", "enum-iterator", "num-complex", "parquet", "pco", - "structopt", ] [[package]] @@ -1102,7 +1204,7 @@ version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -1143,6 +1245,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "0.38.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4" +dependencies = [ + "bitflags 2.3.3", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.48.0", +] + [[package]] name = "ryu" version = "1.0.9" @@ -1218,33 +1333,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - -[[package]] -name = "structopt" -version = "0.3.26" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" -dependencies = [ - "clap", - "lazy_static", - "structopt-derive", -] - -[[package]] -name = "structopt-derive" -version = "0.4.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" -dependencies = [ - "heck 0.3.3", - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.86", -] +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" @@ -1285,22 +1376,13 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99f688a08b54f4f02f0a3c382aefdb7884d3d69609f785bd253dc033243e3fe4" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro-error", "proc-macro2", "quote", "syn 1.0.86", ] -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width", -] - [[package]] name = "thrift" version = "0.17.0" @@ -1368,12 +1450,6 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"22049a19f4a68748a168c0fc439f9516686aa045927ff767eca0a85101fb6e73" -[[package]] -name = "unicode-segmentation" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" - [[package]] name = "unicode-width" version = "0.1.10" @@ -1387,10 +1463,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" [[package]] -name = "vec_map" -version = "0.8.2" +name = "utf8parse" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "version_check" @@ -1502,6 +1578,15 @@ dependencies = [ "windows_x86_64_msvc 0.36.1", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-targets" version = "0.48.1" diff --git a/README.md b/README.md index c5ad72e5..c55c8675 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ # Pcodec -Pcodec (or pco) losslessly compresses and decompresses numerical sequences +Pcodec (or pco, pronounced "pico") losslessly compresses and decompresses +numerical sequences with high compression ratio and moderately fast speed. **Use cases:** @@ -47,8 +48,7 @@ See [benchmarks.md](./bench/benchmarks.md) or run the benchmark suite via ## Etymology -Pco should be pronounced "pico". The names pcodec and pco were chosen for these -reasons: +The names pcodec and pco were chosen for these reasons: * "Pico" suggests that it makes very small things. * Pco is reminiscent of qco, its preceding format. * Pco is reminiscent of PancakeDB (Pancake COmpressed). 
Though PancakeDB is now @@ -57,6 +57,6 @@ reasons: search for. The names are used for these purposes: -* pco => the library, data format, and command line tool +* pco => the library and data format * pco_cli => the binary crate name * pcodec => the binary CLI and the repo \ No newline at end of file diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 2934cbdd..71aba695 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -9,8 +9,8 @@ name = "bench" path = "src/main.rs" [dependencies] +clap = {version = "4.3.11", features = ["derive"]} pco = {path = "../pco" } q_compress = {path = "../quantile-compression/q_compress" } -structopt = "0.3.26" tabled = "0.12.2" zstd = "0.12" diff --git a/bench/src/main.rs b/bench/src/main.rs index 2b83b7a7..ae6e1c19 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -7,7 +7,7 @@ use std::ops::AddAssign; use std::path::Path; use std::time::{Duration, Instant}; -use structopt::StructOpt; +use clap::Parser; use tabled::settings::object::Columns; use tabled::settings::{Alignment, Modify, Style}; use tabled::{Table, Tabled}; @@ -19,19 +19,19 @@ const BASE_DIR: &str = "bench/data"; // if this delta order is specified, use a dataset-specific order const AUTO_DELTA: usize = usize::MAX; -#[derive(StructOpt)] +#[derive(Parser)] struct Opt { - #[structopt(long, short, default_value = "all")] + #[arg(long, short, default_value = "all")] datasets: String, - #[structopt(long, short, default_value = "10")] + #[arg(long, short, default_value = "10")] pub iters: usize, - #[structopt(long, short, default_value = "pco")] + #[arg(long, short, default_value = "pco")] compressors: String, - #[structopt(long)] + #[arg(long)] pub no_compress: bool, - #[structopt(long)] + #[arg(long)] pub no_decompress: bool, - #[structopt(long)] + #[arg(long)] pub no_assertions: bool, } @@ -449,7 +449,7 @@ fn print_stats(stats: &[BenchStat]) { } fn main() { - let opt: Opt = Opt::from_args(); + let opt: Opt = Opt::parse(); let files = 
fs::read_dir(format!("{}/binary", BASE_DIR)).expect("couldn't read"); let mut paths = files diff --git a/pco_cli/Cargo.toml b/pco_cli/Cargo.toml index eff7078c..331d222d 100644 --- a/pco_cli/Cargo.toml +++ b/pco_cli/Cargo.toml @@ -18,9 +18,9 @@ path = "src/main.rs" [dependencies] anyhow = "1.0.53" arrow = {version = "43.0.0", features = ["csv"], default-features=false} +clap = {version = "4.3.11", features = ["derive"]} num-complex = "0.4.3" parquet = {version = "43.0.0", features = ["arrow", "base64", "snap", "zstd"], default-features=false} -structopt = "0.3.26" pco = {version = "0.0.0-alpha.0", path = "../pco" } [dev-dependencies] diff --git a/pco_cli/src/main.rs b/pco_cli/src/main.rs index ff163aab..0d8ff9e1 100644 --- a/pco_cli/src/main.rs +++ b/pco_cli/src/main.rs @@ -1,7 +1,7 @@ use anyhow::Result; -use structopt::StructOpt; +use clap::Parser; -use crate::opt::Opt; +use crate::opt::{Opt, OptWrapper}; mod compress; mod compress_handler; @@ -16,7 +16,7 @@ mod opt; mod utils; fn main() -> Result<()> { - let opt = Opt::from_args(); + let opt = OptWrapper::parse().opt; match opt { Opt::Compress(compress_opt) => compress::compress(compress_opt)?, Opt::Decompress(decompress_opt) => decompress::decompress(decompress_opt)?, diff --git a/pco_cli/src/opt.rs b/pco_cli/src/opt.rs index ebed6340..b7750f51 100644 --- a/pco_cli/src/opt.rs +++ b/pco_cli/src/opt.rs @@ -1,56 +1,56 @@ use std::path::PathBuf; +use clap::{Parser, Subcommand}; use anyhow::anyhow; use anyhow::Result; -use structopt::StructOpt; use crate::dtype::DType; -#[derive(Clone, Debug, StructOpt)] -#[structopt { - name = "pcodec CLI", - about = "A command line tool to compress, decompress, and inspect .pco files", -}] +#[derive(Clone, Debug, Parser)] +#[command(about = "A command line tool to compress, decompress, and inspect .pco files")] +pub struct OptWrapper { + #[command(subcommand)] + pub opt: Opt, +} + +#[derive(Subcommand, Clone, Debug)] pub enum Opt { - #[structopt(name = "compress")] 
Compress(CompressOpt), - #[structopt(name = "decompress")] Decompress(DecompressOpt), - #[structopt(name = "inspect")] Inspect(InspectOpt), } -#[derive(Clone, Debug, StructOpt)] +#[derive(Clone, Debug, Parser)] pub struct CompressOpt { - #[structopt(long = "csv")] + #[arg(long = "csv")] pub csv_path: Option, - #[structopt(long = "parquet")] + #[arg(long = "parquet")] pub parquet_path: Option, - #[structopt(long, default_value = "8")] + #[arg(long, default_value = "8")] pub level: usize, - #[structopt(long = "delta-order")] + #[arg(long = "delta-order")] pub delta_encoding_order: Option, - #[structopt(long)] + #[arg(long)] pub disable_gcds: bool, - #[structopt(long)] + #[arg(long)] pub dtype: Option, - #[structopt(long)] + #[arg(long)] pub col_name: Option, - #[structopt(long)] + #[arg(long)] pub col_idx: Option, - #[structopt(long, default_value = "1000000")] + #[arg(long, default_value = "1000000")] pub chunk_size: usize, - #[structopt(long)] + #[arg(long)] pub overwrite: bool, - #[structopt(long = "csv-has-header")] + #[arg(long = "csv-has-header")] pub has_csv_header: bool, - #[structopt( + #[arg( long = "csv-timestamp-format", default_value = "%Y-%m-%dT%H:%M:%S%.f%z" )] pub timestamp_format: String, - #[structopt(long = "csv-delimiter", default_value = ",")] + #[arg(long = "csv-delimiter", default_value = ",")] pub delimiter: char, pub pco_path: PathBuf, @@ -70,17 +70,17 @@ impl CompressOpt { } } -#[derive(Clone, Debug, StructOpt)] +#[derive(Clone, Debug, Parser)] pub struct DecompressOpt { - #[structopt(long)] + #[arg(long)] pub limit: Option, - #[structopt(long, default_value = "%Y-%m-%dT%H:%M:%S%.f")] + #[arg(long, default_value = "%Y-%m-%dT%H:%M:%S%.f")] pub timestamp_format: String, pub pco_path: PathBuf, } -#[derive(Clone, Debug, StructOpt)] +#[derive(Clone, Debug, Parser)] pub struct InspectOpt { pub path: PathBuf, } From 52413e120f9b25ba102a3ae32bb58d3ed01f58ec Mon Sep 17 00:00:00 2001 From: mwlon Date: Sat, 8 Jul 2023 22:41:16 -0400 Subject: [PATCH 
02/18] split out codec config and opt --- bench/src/codec_config.rs | 45 ++++++++++++ bench/src/main.rs | 149 ++++++-------------------------------- bench/src/opt.rs | 60 +++++++++++++++ pco_cli/src/opt.rs | 5 +- 4 files changed, 130 insertions(+), 129 deletions(-) create mode 100644 bench/src/codec_config.rs create mode 100644 bench/src/opt.rs diff --git a/bench/src/codec_config.rs b/bench/src/codec_config.rs new file mode 100644 index 00000000..2b359c8e --- /dev/null +++ b/bench/src/codec_config.rs @@ -0,0 +1,45 @@ +use std::fmt::{Display, Formatter}; + +#[derive(Clone, Debug)] +pub enum CodecConfig { + Pco(pco::CompressorConfig), + QCompress(q_compress::CompressorConfig), + ZStd(usize), +} + +impl CodecConfig { + pub fn codec(&self) -> &'static str { + match self { + CodecConfig::Pco(_) => "pco", + CodecConfig::QCompress(_) => "qco", + CodecConfig::ZStd(_) => "zstd", + } + } + + pub fn details(&self) -> String { + match self { + CodecConfig::Pco(config) => { + format!( + "{}:{}:{}", + config.compression_level, config.delta_encoding_order, config.use_gcds + ) + } + CodecConfig::QCompress(config) => { + format!( + "{}:{}:{}", + config.compression_level, config.delta_encoding_order, config.use_gcds + ) + } + CodecConfig::ZStd(level) => { + format!("{}", level) + } + } + } +} + +impl Display for CodecConfig { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}", self.codec(), self.details()) + } +} + diff --git a/bench/src/main.rs b/bench/src/main.rs index ae6e1c19..9a9c025a 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -1,6 +1,8 @@ #![allow(clippy::useless_transmute)] -use std::fmt::{Display, Formatter}; +mod opt; +mod codec_config; + use std::fs; use std::io::ErrorKind; use std::ops::AddAssign; @@ -12,28 +14,14 @@ use tabled::settings::object::Columns; use tabled::settings::{Alignment, Modify, Style}; use tabled::{Table, Tabled}; +use opt::Opt; +use codec_config::CodecConfig; use pco::data_types::NumberLike as DNumberLike; 
use q_compress::data_types::{NumberLike as QNumberLike, TimestampMicros}; +use crate::opt::AUTO_DELTA; const BASE_DIR: &str = "bench/data"; // if this delta order is specified, use a dataset-specific order -const AUTO_DELTA: usize = usize::MAX; - -#[derive(Parser)] -struct Opt { - #[arg(long, short, default_value = "all")] - datasets: String, - #[arg(long, short, default_value = "10")] - pub iters: usize, - #[arg(long, short, default_value = "pco")] - compressors: String, - #[arg(long)] - pub no_compress: bool, - #[arg(long)] - pub no_decompress: bool, - #[arg(long)] - pub no_assertions: bool, -} trait NumberLike: QNumberLike { type Pco: DNumberLike; @@ -63,99 +51,6 @@ impl_pco_number_like!(f32, f32); impl_pco_number_like!(f64, f64); impl_pco_number_like!(TimestampMicros, i64); -#[derive(Clone, Debug)] -enum MultiCompressorConfig { - Pco(pco::CompressorConfig), - QCompress(q_compress::CompressorConfig), - ZStd(usize), -} - -impl MultiCompressorConfig { - pub fn codec(&self) -> &'static str { - match self { - MultiCompressorConfig::Pco(_) => "pco", - MultiCompressorConfig::QCompress(_) => "qco", - MultiCompressorConfig::ZStd(_) => "zstd", - } - } - - pub fn details(&self) -> String { - match self { - MultiCompressorConfig::Pco(config) => { - format!( - "{}:{}:{}", - config.compression_level, config.delta_encoding_order, config.use_gcds - ) - } - MultiCompressorConfig::QCompress(config) => { - format!( - "{}:{}:{}", - config.compression_level, config.delta_encoding_order, config.use_gcds - ) - } - MultiCompressorConfig::ZStd(level) => { - format!("{}", level,) - } - } - } -} - -impl Display for MultiCompressorConfig { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}:{}", self.codec(), self.details(),) - } -} - -impl Opt { - pub fn get_datasets(&self) -> Vec { - let d = self.datasets.to_lowercase(); - d.split(',').map(|s| s.to_string()).collect::>() - } - - pub fn get_compressors(&self) -> Vec { - let mut res = Vec::new(); - for s in 
self.compressors.to_lowercase().split(',') { - let parts = s.split(':').collect::>(); - let level = if parts.len() > 1 { - Some(parts[1].parse().unwrap()) - } else { - None - }; - res.push(match parts[0] { - "p" | "pco" | "pcodec" => { - let delta_encoding_order = if parts.len() > 2 { - parts[2].parse().unwrap() - } else { - AUTO_DELTA - }; - let use_gcds = !(parts.len() > 3 && &parts[3].to_lowercase()[0..3] == "off"); - let config = pco::CompressorConfig::default() - .with_compression_level(level.unwrap_or(q_compress::DEFAULT_COMPRESSION_LEVEL)) - .with_delta_encoding_order(delta_encoding_order) - .with_use_gcds(use_gcds); - MultiCompressorConfig::Pco(config) - } - "q" | "qco" | "q_compress" => { - let delta_encoding_order = if parts.len() > 2 { - parts[2].parse().unwrap() - } else { - AUTO_DELTA - }; - let use_gcds = !(parts.len() > 3 && &parts[3].to_lowercase()[0..3] == "off"); - let config = q_compress::CompressorConfig::default() - .with_compression_level(level.unwrap_or(q_compress::DEFAULT_COMPRESSION_LEVEL)) - .with_delta_encoding_order(delta_encoding_order) - .with_use_gcds(use_gcds); - MultiCompressorConfig::QCompress(config) - } - "zstd" => MultiCompressorConfig::ZStd(level.unwrap_or(3)), - _ => panic!("unknown compressor"), - }) - } - res - } -} - #[derive(Clone, Default)] struct BenchStat { pub dataset: String, @@ -257,12 +152,12 @@ fn decompress_qco(bytes: &[u8]) -> Vec { fn compress( raw_bytes: &[u8], nums: &[T], - config: &MultiCompressorConfig, -) -> (Duration, MultiCompressorConfig, Vec) { + config: &CodecConfig, +) -> (Duration, CodecConfig, Vec) { let t = Instant::now(); let mut qualified_config = config.clone(); let compressed = match &mut qualified_config { - MultiCompressorConfig::Pco(pco_conf) => { + CodecConfig::Pco(pco_conf) => { let mut conf = pco_conf.clone(); let pco_nums = T::slice_to_pco(nums); if conf.delta_encoding_order == AUTO_DELTA { @@ -272,7 +167,7 @@ fn compress( *pco_conf = conf.clone(); compress_pco(pco_nums, conf) } - 
MultiCompressorConfig::QCompress(qco_conf) => { + CodecConfig::QCompress(qco_conf) => { let mut conf = qco_conf.clone(); if conf.delta_encoding_order == AUTO_DELTA { conf.delta_encoding_order = @@ -281,7 +176,7 @@ fn compress( *qco_conf = conf.clone(); compress_qco(nums, conf) } - MultiCompressorConfig::ZStd(level) => { + CodecConfig::ZStd(level) => { let level = *level as i32; zstd::encode_all(raw_bytes, level).unwrap() } @@ -295,13 +190,13 @@ fn compress( fn decompress( compressed: &[u8], - config: &MultiCompressorConfig, + config: &CodecConfig, ) -> (Duration, Vec) { let t = Instant::now(); let rec_nums = match config { - MultiCompressorConfig::Pco(_) => decompress_pco(compressed), - MultiCompressorConfig::QCompress(_) => decompress_qco(compressed), - MultiCompressorConfig::ZStd(_) => { + CodecConfig::Pco(_) => decompress_pco(compressed), + CodecConfig::QCompress(_) => decompress_qco(compressed), + CodecConfig::ZStd(_) => { // to do justice to zstd, unsafely convert the bytes it writes into T // without copying let decoded_bytes = zstd::decode_all(compressed).unwrap(); @@ -314,7 +209,7 @@ fn decompress( fn warmup_iter( path: &Path, dataset: &str, - config: &MultiCompressorConfig, + config: &CodecConfig, opt: &Opt, ) -> Precomputed { // read in data @@ -376,7 +271,7 @@ fn warmup_iter( fn stats_iter( dataset: String, - config: &MultiCompressorConfig, + config: &CodecConfig, precomputed: &Precomputed, opt: &Opt, ) -> BenchStat { @@ -412,7 +307,7 @@ fn stats_iter( } } -fn handle(path: &Path, config: &MultiCompressorConfig, opt: &Opt) -> BenchStat { +fn handle(path: &Path, config: &CodecConfig, opt: &Opt) -> BenchStat { let dataset = basename_no_ext(path); let precomputed = warmup_iter::(path, &dataset, config, opt); @@ -457,20 +352,18 @@ fn main() { .map(|f| f.unwrap().path()) .collect::>(); paths.sort(); - let configs = opt.get_compressors(); - let datasets = opt.get_datasets(); let mut stats = Vec::new(); for path in paths { let path_str = path.to_str().unwrap(); 
- let keep = datasets + let keep = opt.datasets.is_empty() || opt.datasets .iter() - .any(|dataset| path_str.contains(dataset) || dataset == "all"); + .any(|dataset| path_str.contains(dataset)); if !keep { continue; } - for config in &configs { + for config in &opt.codecs { let stat = if path_str.contains("i64") || path_str.contains("micros") { handle::(&path, config, &opt) } else if path_str.contains("f64") { diff --git a/bench/src/opt.rs b/bench/src/opt.rs new file mode 100644 index 00000000..abefce50 --- /dev/null +++ b/bench/src/opt.rs @@ -0,0 +1,60 @@ +use clap::Parser; +use crate::codec_config::CodecConfig; + +pub const AUTO_DELTA: usize = usize::MAX; + +#[derive(Parser)] +pub struct Opt { + #[arg(long, short, default_value = "pco", value_parser=parse_codec, value_delimiter=',')] + pub codecs: Vec, + #[arg(long, short, default_value = "", value_delimiter=',')] + pub datasets: Vec, + #[arg(long, short, default_value = "10")] + pub iters: usize, + #[arg(long)] + pub no_compress: bool, + #[arg(long)] + pub no_decompress: bool, + #[arg(long)] + pub no_assertions: bool, +} + +pub fn parse_codec(s: &str) -> Result { + let parts = s.split(':').collect::>(); + let level = if parts.len() > 1 { + Some(parts[1].parse().unwrap()) + } else { + None + }; + match parts[0] { + "p" | "pco" | "pcodec" => { + let delta_encoding_order = if parts.len() > 2 { + parts[2].parse().unwrap() + } else { + AUTO_DELTA + }; + let use_gcds = !(parts.len() > 3 && &parts[3].to_lowercase()[0..3] == "off"); + let config = pco::CompressorConfig::default() + .with_compression_level(level.unwrap_or(q_compress::DEFAULT_COMPRESSION_LEVEL)) + .with_delta_encoding_order(delta_encoding_order) + .with_use_gcds(use_gcds); + Ok(CodecConfig::Pco(config)) + } + "q" | "qco" | "q_compress" => { + let delta_encoding_order = if parts.len() > 2 { + parts[2].parse().unwrap() + } else { + AUTO_DELTA + }; + let use_gcds = !(parts.len() > 3 && &parts[3].to_lowercase()[0..3] == "off"); + let config = 
q_compress::CompressorConfig::default() + .with_compression_level(level.unwrap_or(q_compress::DEFAULT_COMPRESSION_LEVEL)) + .with_delta_encoding_order(delta_encoding_order) + .with_use_gcds(use_gcds); + Ok(CodecConfig::QCompress(config)) + } + "zstd" => Ok(CodecConfig::ZStd(level.unwrap_or(3))), + _ => Err("unknown compressor"), + } +} + diff --git a/pco_cli/src/opt.rs b/pco_cli/src/opt.rs index b7750f51..eade8a36 100644 --- a/pco_cli/src/opt.rs +++ b/pco_cli/src/opt.rs @@ -7,7 +7,7 @@ use anyhow::Result; use crate::dtype::DType; #[derive(Clone, Debug, Parser)] -#[command(about = "A command line tool to compress, decompress, and inspect .pco files")] +#[command(about = "compress, decompress, and inspect .pco files")] pub struct OptWrapper { #[command(subcommand)] pub opt: Opt, @@ -21,6 +21,7 @@ pub enum Opt { } #[derive(Clone, Debug, Parser)] +#[command(about = "compress from a different format into standalone .pco")] pub struct CompressOpt { #[arg(long = "csv")] pub csv_path: Option, @@ -71,6 +72,7 @@ impl CompressOpt { } #[derive(Clone, Debug, Parser)] +#[command(about = "decompress from standalone .pco into stdout")] pub struct DecompressOpt { #[arg(long)] pub limit: Option, @@ -81,6 +83,7 @@ pub struct DecompressOpt { } #[derive(Clone, Debug, Parser)] +#[command(about = "print metadata about a standalone .pco file")] pub struct InspectOpt { pub path: PathBuf, } From d5c3d862aa9750c0a197ac848b2b1b7988121aa1 Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 12:43:05 -0400 Subject: [PATCH 03/18] refactor almost working --- Cargo.lock | 5 +- bench/Cargo.toml | 1 + bench/src/codec_config.rs | 45 ----- bench/src/main.rs | 365 +++++++++++++------------------------- bench/src/opt.rs | 55 ++---- pco_cli/Cargo.toml | 2 +- 6 files changed, 141 insertions(+), 332 deletions(-) delete mode 100644 bench/src/codec_config.rs diff --git a/Cargo.lock b/Cargo.lock index e59d07cc..30ea736b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -90,9 +90,9 @@ dependencies = [ 
[[package]] name = "anyhow" -version = "1.0.53" +version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94a45b455c14666b85fc40a019e8ab9eb75e3a124e05494f5397122bc9eb06e0" +checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" [[package]] name = "arrow" @@ -298,6 +298,7 @@ checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" name = "bench" version = "0.0.0" dependencies = [ + "anyhow", "clap", "pco", "q_compress", diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 71aba695..e8cc57a8 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -9,6 +9,7 @@ name = "bench" path = "src/main.rs" [dependencies] +anyhow = "1.0.71" clap = {version = "4.3.11", features = ["derive"]} pco = {path = "../pco" } q_compress = {path = "../quantile-compression/q_compress" } diff --git a/bench/src/codec_config.rs b/bench/src/codec_config.rs deleted file mode 100644 index 2b359c8e..00000000 --- a/bench/src/codec_config.rs +++ /dev/null @@ -1,45 +0,0 @@ -use std::fmt::{Display, Formatter}; - -#[derive(Clone, Debug)] -pub enum CodecConfig { - Pco(pco::CompressorConfig), - QCompress(q_compress::CompressorConfig), - ZStd(usize), -} - -impl CodecConfig { - pub fn codec(&self) -> &'static str { - match self { - CodecConfig::Pco(_) => "pco", - CodecConfig::QCompress(_) => "qco", - CodecConfig::ZStd(_) => "zstd", - } - } - - pub fn details(&self) -> String { - match self { - CodecConfig::Pco(config) => { - format!( - "{}:{}:{}", - config.compression_level, config.delta_encoding_order, config.use_gcds - ) - } - CodecConfig::QCompress(config) => { - format!( - "{}:{}:{}", - config.compression_level, config.delta_encoding_order, config.use_gcds - ) - } - CodecConfig::ZStd(level) => { - format!("{}", level) - } - } - } -} - -impl Display for CodecConfig { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}:{}", self.codec(), self.details()) - } -} - diff --git a/bench/src/main.rs 
b/bench/src/main.rs index 9a9c025a..d4fbf5d9 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -1,11 +1,11 @@ #![allow(clippy::useless_transmute)] mod opt; -mod codec_config; +mod codecs; use std::fs; use std::io::ErrorKind; -use std::ops::AddAssign; +use std::ops::{AddAssign, Div}; use std::path::Path; use std::time::{Duration, Instant}; @@ -15,16 +15,16 @@ use tabled::settings::{Alignment, Modify, Style}; use tabled::{Table, Tabled}; use opt::Opt; -use codec_config::CodecConfig; -use pco::data_types::NumberLike as DNumberLike; +use pco::data_types::NumberLike as PNumberLike; use q_compress::data_types::{NumberLike as QNumberLike, TimestampMicros}; +use crate::codecs::CodecConfig; use crate::opt::AUTO_DELTA; const BASE_DIR: &str = "bench/data"; // if this delta order is specified, use a dataset-specific order trait NumberLike: QNumberLike { - type Pco: DNumberLike; + type Pco: PNumberLike; fn slice_to_pco(slice: &[Self]) -> &[Self::Pco]; fn vec_from_pco(v: Vec) -> Vec; @@ -53,49 +53,45 @@ impl_pco_number_like!(TimestampMicros, i64); #[derive(Clone, Default)] struct BenchStat { - pub dataset: String, - pub codec: String, pub compress_dt: Duration, pub decompress_dt: Duration, pub compressed_size: usize, - pub iters: usize, } -#[derive(Tabled)] +fn median_duration(mut durations: Vec) -> Duration { + durations.sort_unstable(); + let lo = durations[durations.len() / 2]; + let hi = durations[(durations.len() + 1) / 2]; + (lo + hi) / 2 +} + +fn display_duration(duration: &Duration) -> String { + format!("{:?}", duration) +} + +#[derive(Tabled, Default)] struct PrintStat { pub dataset: String, pub codec: String, - pub compress_dt: String, - pub decompress_dt: String, + #[tabled(display_with = "display_duration")] + pub compress_dt: Duration, + #[tabled(display_with = "display_duration")] + pub decompress_dt: Duration, pub compressed_size: usize, } -impl AddAssign for BenchStat { - fn add_assign(&mut self, rhs: Self) { - self.compressed_size += 
rhs.compressed_size; - self.compress_dt += rhs.compress_dt; - self.decompress_dt += rhs.decompress_dt; - self.iters += rhs.iters; - } -} - -impl BenchStat { - fn normalize(&mut self) { - self.compressed_size /= self.iters; - self.compress_dt /= self.iters as u32; - self.decompress_dt /= self.iters as u32; - self.iters = 1; - } -} +impl PrintStat { + fn compute(dataset: String, codec: String, benches: &[BenchStat]) -> Self { + let compressed_size = benches[0].compressed_size; + let compress_dts = benches.iter().map(|bench| bench.compress_dt).collect::>(); + let decompress_dts = benches.iter().map(|bench| bench.decompress_dt).collect::>(); -impl From for PrintStat { - fn from(value: BenchStat) -> Self { PrintStat { - dataset: value.dataset, - codec: value.codec, - compressed_size: value.compressed_size, - compress_dt: format!("{:?}", value.compress_dt), - decompress_dt: format!("{:?}", value.decompress_dt), + dataset, + codec, + compressed_size, + compress_dt: median_duration(compress_dts), + decompress_dt: median_duration(decompress_dts), } } } @@ -112,231 +108,118 @@ fn basename_no_ext(path: &Path) -> String { } } -struct Precomputed { +struct Precomputed { raw_bytes: Vec, - nums: Vec, compressed: Vec, - codec: String, } -fn cast_to_nums(bytes: Vec) -> Vec { +fn cast_to_nums(bytes: &[u8]) -> &[T] { // Here we're assuming the bytes are in the right format for our data type. // For instance, chunks of 8 little-endian bytes on most platforms for // i64's. // This is fast and should work across platforms. 
- let n = bytes.len() / (T::PHYSICAL_BITS / 8); - unsafe { - let mut nums = std::mem::transmute::<_, Vec>(bytes); - nums.set_len(n); - nums - } -} - -fn compress_pco(nums: &[T], config: pco::CompressorConfig) -> Vec { - pco::standalone::simple_compress(config, nums) + unsafe { std::mem::transmute(bytes) } } -fn decompress_pco(bytes: &[u8]) -> Vec { - let v = pco::standalone::auto_decompress::(bytes).expect("could not decompress"); - T::vec_from_pco(v) -} - -fn compress_qco(nums: &[T], config: q_compress::CompressorConfig) -> Vec { - q_compress::Compressor::::from_config(config).simple_compress(nums) -} - -fn decompress_qco(bytes: &[u8]) -> Vec { - q_compress::auto_decompress(bytes).expect("could not decompress") -} - -fn compress( - raw_bytes: &[u8], - nums: &[T], - config: &CodecConfig, -) -> (Duration, CodecConfig, Vec) { - let t = Instant::now(); - let mut qualified_config = config.clone(); - let compressed = match &mut qualified_config { - CodecConfig::Pco(pco_conf) => { - let mut conf = pco_conf.clone(); - let pco_nums = T::slice_to_pco(nums); - if conf.delta_encoding_order == AUTO_DELTA { - conf.delta_encoding_order = - pco::auto_compressor_config(pco_nums, conf.compression_level).delta_encoding_order; - } - *pco_conf = conf.clone(); - compress_pco(pco_nums, conf) - } - CodecConfig::QCompress(qco_conf) => { - let mut conf = qco_conf.clone(); - if conf.delta_encoding_order == AUTO_DELTA { - conf.delta_encoding_order = - q_compress::auto_compressor_config(nums, conf.compression_level).delta_encoding_order; - } - *qco_conf = conf.clone(); - compress_qco(nums, conf) - } - CodecConfig::ZStd(level) => { - let level = *level as i32; - zstd::encode_all(raw_bytes, level).unwrap() - } - }; - ( - Instant::now() - t, - qualified_config, - compressed, - ) -} - -fn decompress( - compressed: &[u8], - config: &CodecConfig, -) -> (Duration, Vec) { - let t = Instant::now(); - let rec_nums = match config { - CodecConfig::Pco(_) => decompress_pco(compressed), - 
CodecConfig::QCompress(_) => decompress_qco(compressed), - CodecConfig::ZStd(_) => { - // to do justice to zstd, unsafely convert the bytes it writes into T - // without copying - let decoded_bytes = zstd::decode_all(compressed).unwrap(); - cast_to_nums(decoded_bytes) - } - }; - (Instant::now() - t, rec_nums) +fn cast_to_bytes(nums: &[T]) -> &[u8] { + // Here we're assuming the bytes are in the right format for our data type. + // For instance, chunks of 8 little-endian bytes on most platforms for + // i64's. + // This is fast and should work across platforms. + unsafe { std::mem::transmute(nums) } } -fn warmup_iter( - path: &Path, - dataset: &str, - config: &CodecConfig, - opt: &Opt, -) -> Precomputed { - // read in data - let raw_bytes = fs::read(path).expect("could not read"); - let nums = cast_to_nums(raw_bytes.clone()); - - // compress - let (_, qualified_config, compressed) = compress(&raw_bytes, &nums, config); - println!( - "\ndataset warmup: {} config: {:?}", - dataset, qualified_config - ); - println!("\tcompressed to {} bytes", compressed.len()); +// fn compress_pco(nums: &[T], config: pco::CompressorConfig) -> Vec { +// pco::standalone::simple_compress(config, nums) +// } +// +// fn compress_qco(nums: &[T], config: q_compress::CompressorConfig) -> Vec { +// q_compress::Compressor::::from_config(config).simple_compress(nums) +// } +// +// fn decompress_qco(bytes: &[u8]) -> Vec { +// q_compress::auto_decompress(bytes).expect("could not decompress") +// } +// +// fn compress( +// nums: &[T], +// config: &CodecConfig, +// ) -> (Duration, Vec) { +// let t = Instant::now(); +// let compressed = config.inner.compress(nums); +// // let compressed = match &mut qualified_config { +// // CodecConfig::Pco(pco_conf) => { +// // let mut conf = pco_conf.clone(); +// // let pco_nums = T::slice_to_pco(nums); +// // if conf.delta_encoding_order == AUTO_DELTA { +// // conf.delta_encoding_order = +// // pco::auto_compressor_config(pco_nums, 
conf.compression_level).delta_encoding_order; +// // } +// // *pco_conf = conf.clone(); +// // compress_pco(pco_nums, conf) +// // } +// // CodecConfig::QCompress(qco_conf) => { +// // let mut conf = qco_conf.clone(); +// // if conf.delta_encoding_order == AUTO_DELTA { +// // conf.delta_encoding_order = +// // q_compress::auto_compressor_config(nums, conf.compression_level).delta_encoding_order; +// // } +// // *qco_conf = conf.clone(); +// // compress_qco(nums, conf) +// // } +// // CodecConfig::ZStd(level) => { +// // let level = *level as i32; +// // zstd::encode_all(raw_bytes, level).unwrap() +// // } +// // }; +// ( +// Instant::now() - t, +// compressed, +// ) +// } + +// fn decompress( +// compressed: &[u8], +// config: &CodecConfig, +// ) -> (Duration, Vec) { +// let t = Instant::now(); +// let rec_nums = config.inner.decompress(compressed); +// // let rec_nums = match config { +// // CodecConfig::Pco(_) => decompress_pco(compressed), +// // CodecConfig::QCompress(_) => decompress_qco(compressed), +// // CodecConfig::ZStd(_) => { +// // // to do justice to zstd, unsafely convert the bytes it writes into T +// // // without copying +// // let decoded_bytes = zstd::decode_all(compressed).unwrap(); +// // cast_to_nums(decoded_bytes) +// // } +// // }; +// (Instant::now() - t, rec_nums) +// } + +fn handle(path: &Path, config: &CodecConfig, opt: &Opt) -> PrintStat { + let dataset = basename_no_ext(path); - // write to disk let mut fname = dataset.to_string(); fname.push('_'); - fname.push_str(&qualified_config.details()); - let output_dir = format!("{}/{}", BASE_DIR, config.codec()); - let output_path = format!("{}/{}.qco", output_dir, fname); - - match fs::create_dir(&output_dir) { - Ok(()) => (), - Err(e) => match e.kind() { - ErrorKind::AlreadyExists => (), - _ => panic!("{}", e), - }, - } - fs::write(output_path, &compressed).expect("couldn't write"); - - // decompress - let (_, rec_nums) = decompress::(&compressed, config); - - if !opt.no_assertions { - // 
make sure everything came back correct - if rec_nums.len() != nums.len() { - println!( - "original len: {} recovered len: {}", - nums.len(), - rec_nums.len() - ); - panic!("got back the wrong number of numbers!"); - } - for i in 0..rec_nums.len() { - if !rec_nums[i].num_eq(&nums[i]) { - println!("{} num {} -> {}", i, nums[i], rec_nums[i]); - panic!("failed to recover nums by compressing and decompressing!"); - } - } - } - - Precomputed { - raw_bytes, - nums, - compressed, - codec: qualified_config.to_string(), - } -} - -fn stats_iter( - dataset: String, - config: &CodecConfig, - precomputed: &Precomputed, - opt: &Opt, -) -> BenchStat { - // compress - let compress_dt = if !opt.no_compress { - let (dt, _, _) = compress( - &precomputed.raw_bytes, - &precomputed.nums, - config, - ); - println!("\tcompressed in {:?}", dt); - dt - } else { - Duration::ZERO - }; - - // decompress - let decompress_dt = if !opt.no_decompress { - let (dt, _) = decompress::(&precomputed.compressed, config); - println!("\tdecompressed in {:?}", dt); - dt - } else { - Duration::ZERO - }; - - BenchStat { - dataset, - codec: precomputed.codec.clone(), - compressed_size: precomputed.compressed.len(), - compress_dt, - decompress_dt, - iters: 1, - } -} - -fn handle(path: &Path, config: &CodecConfig, opt: &Opt) -> BenchStat { - let dataset = basename_no_ext(path); - - let precomputed = warmup_iter::(path, &dataset, config, opt); - let mut full_stat = BenchStat { - codec: config.codec().to_string(), - dataset: dataset.clone(), - ..Default::default() - }; + fname.push_str(&config.details()); + let precomputed = config.inner.warmup_iter(path, &dataset, &fname, &opt.handler_opt); + let mut benches = Vec::with_capacity(opt.iters); for _ in 0..opt.iters { - let iter_stat = stats_iter::(dataset.clone(), config, &precomputed, opt); - full_stat.codec = iter_stat.codec.clone(); // sometimes we get a more precise codec name - full_stat += iter_stat; + benches.push(config.inner.stats_iter(&dataset, 
&precomputed, &opt.handler_opt)); } - full_stat.normalize(); - full_stat + PrintStat::compute(dataset, config.to_string(), &benches) } -fn print_stats(stats: &[BenchStat]) { - let mut print_stats = stats - .iter() - .cloned() - .map(PrintStat::from) - .collect::>(); - let mut aggregate = BenchStat::default(); - for stat in stats { - aggregate += stat.clone(); +fn print_stats(mut stats: Vec) { + let mut aggregate = PrintStat::default(); + for stat in &stats { + aggregate.compressed_size += stat.compressed_size; + aggregate.compress_dt += stat.compress_dt; + aggregate.decompress_dt += stat.decompress_dt; } - print_stats.push(PrintStat::from(aggregate)); - let table = Table::new(print_stats) + stats.push(aggregate); + let table = Table::new(stats) .with(Style::rounded()) .with(Modify::new(Columns::new(2..)).with(Alignment::right())) .to_string(); @@ -382,5 +265,5 @@ fn main() { } } - print_stats(&stats); + print_stats(stats); } diff --git a/bench/src/opt.rs b/bench/src/opt.rs index abefce50..11d0810c 100644 --- a/bench/src/opt.rs +++ b/bench/src/opt.rs @@ -1,16 +1,25 @@ -use clap::Parser; -use crate::codec_config::CodecConfig; +use std::str::FromStr; + +use anyhow::{anyhow, Result}; +use clap::{Args, Parser}; +use crate::codecs::CodecConfig; pub const AUTO_DELTA: usize = usize::MAX; #[derive(Parser)] pub struct Opt { - #[arg(long, short, default_value = "pco", value_parser=parse_codec, value_delimiter=',')] + #[arg(long, short, default_value = "pco", value_parser=CodecConfig::from_str, value_delimiter=',')] pub codecs: Vec, #[arg(long, short, default_value = "", value_delimiter=',')] pub datasets: Vec, #[arg(long, short, default_value = "10")] pub iters: usize, + #[command(flatten)] + pub handler_opt: HandlerOpt, +} + +#[derive(Args)] +pub struct HandlerOpt { #[arg(long)] pub no_compress: bool, #[arg(long)] @@ -18,43 +27,3 @@ pub struct Opt { #[arg(long)] pub no_assertions: bool, } - -pub fn parse_codec(s: &str) -> Result { - let parts = s.split(':').collect::>(); - 
let level = if parts.len() > 1 { - Some(parts[1].parse().unwrap()) - } else { - None - }; - match parts[0] { - "p" | "pco" | "pcodec" => { - let delta_encoding_order = if parts.len() > 2 { - parts[2].parse().unwrap() - } else { - AUTO_DELTA - }; - let use_gcds = !(parts.len() > 3 && &parts[3].to_lowercase()[0..3] == "off"); - let config = pco::CompressorConfig::default() - .with_compression_level(level.unwrap_or(q_compress::DEFAULT_COMPRESSION_LEVEL)) - .with_delta_encoding_order(delta_encoding_order) - .with_use_gcds(use_gcds); - Ok(CodecConfig::Pco(config)) - } - "q" | "qco" | "q_compress" => { - let delta_encoding_order = if parts.len() > 2 { - parts[2].parse().unwrap() - } else { - AUTO_DELTA - }; - let use_gcds = !(parts.len() > 3 && &parts[3].to_lowercase()[0..3] == "off"); - let config = q_compress::CompressorConfig::default() - .with_compression_level(level.unwrap_or(q_compress::DEFAULT_COMPRESSION_LEVEL)) - .with_delta_encoding_order(delta_encoding_order) - .with_use_gcds(use_gcds); - Ok(CodecConfig::QCompress(config)) - } - "zstd" => Ok(CodecConfig::ZStd(level.unwrap_or(3))), - _ => Err("unknown compressor"), - } -} - diff --git a/pco_cli/Cargo.toml b/pco_cli/Cargo.toml index 331d222d..d0a52653 100644 --- a/pco_cli/Cargo.toml +++ b/pco_cli/Cargo.toml @@ -16,7 +16,7 @@ name = "pcodec" path = "src/main.rs" [dependencies] -anyhow = "1.0.53" +anyhow = "1.0.71" arrow = {version = "43.0.0", features = ["csv"], default-features=false} clap = {version = "4.3.11", features = ["derive"]} num-complex = "0.4.3" From 69d649382d537786299e517f8f64a7ca4c543259 Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 12:43:25 -0400 Subject: [PATCH 04/18] include files --- bench/src/codecs/mod.rs | 246 ++++++++++++++++++++++++++++++++++++++++ bench/src/codecs/pco.rs | 66 +++++++++++ 2 files changed, 312 insertions(+) create mode 100644 bench/src/codecs/mod.rs create mode 100644 bench/src/codecs/pco.rs diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs new 
file mode 100644 index 00000000..160aa7c7 --- /dev/null +++ b/bench/src/codecs/mod.rs @@ -0,0 +1,246 @@ +mod pco; + +use std::fmt::{Debug, Display, Formatter}; +use std::{fs, mem}; +use std::io::ErrorKind; +use std::path::Path; +use std::str::FromStr; +use std::time::{Duration, Instant}; + +use anyhow::{anyhow, Result}; +use crate::codecs::pco::PcoConfig; +use crate::{BASE_DIR, BenchStat, cast_to_nums, NumberLike, Precomputed}; +use crate::opt::HandlerOpt; + +// Unfortunately we can't make a Box because it has generic +// functions, so we use a wrapping trait (CodecSurface) to manually dynamic +// dispatch. +trait CodecInternal: Clone + Debug + Send + Sync + Default + 'static { + fn name(&self) -> &'static str; + // panics if not found because that's a bug + fn get_conf(&self, key: &str) -> String; + fn set_conf(&mut self, key: &str, value: String) -> Result<()>; + + fn compress(&self, nums: &[T]) -> Vec; + fn decompress(&self, compressed: &[u8]) -> Vec; + + // sad manual dynamic dispatch, but at least we don't need all combinations + // of (dtype x codec) + fn compress_dynamic(&self, dtype: &str, raw_bytes: &[u8]) -> Vec { + unsafe { + match dtype { + "i64" => self.compress::(mem::transmute(raw_bytes)), + other => panic!("unknown dtype: {}", other), + } + } + } + + fn decompress_dynamic(&self, dtype: &str, compressed: &[u8]) -> Vec { + unsafe { + match dtype { + "i64" => mem::transmute(self.decompress::(compressed)), + other => panic!("unknown dtype: {}", other), + } + } + } +} + +pub trait CodecSurface: Debug + Send + Sync { + fn name(&self) -> &'static str; + // panics if not found because that's a bug + fn get_conf(&self, key: &str) -> String; + fn set_conf(&mut self, key: &str, value: String) -> Result<()>; + fn details(&self, confs: &[String]) -> String; + + fn warmup_iter( + &self, + path: &Path, + dataset: &str, + fname: &str, + opt: &HandlerOpt, + ) -> Precomputed; + + fn stats_iter( + &self, + dataset: &str, + precomputed: &Precomputed, + opt: 
&HandlerOpt, + ) -> BenchStat; + + fn clone_to_box(&self) -> Box; +} + +fn dtype_str(dataset: &str) -> &str { + dataset.split('_').next().unwrap() +} + +impl CodecSurface for C { + fn name(&self) -> &'static str { + self.name() + } + + fn get_conf(&self, key: &str) -> String { + self.get_conf(key) + } + + fn set_conf(&mut self, key: &str, value: String) -> Result<()> { + self.set_conf(key, value) + } + + fn details(&self, confs: &[String]) -> String { + let default = Self::default(); + let mut res = String::new(); + for k in confs { + let v = self.get_conf(k); + if v != default.get_conf(k) { + res.push_str(&format!( + ":{}={}", + k, + v, + )); + } + } + res + } + + fn warmup_iter(&self, path: &Path, dataset: &str, fname: &str, opt: &HandlerOpt) -> Precomputed { + // read in data + let raw_bytes = fs::read(path).expect("could not read"); + + // compress + let dtype = dtype_str(dataset); + let compressed = self.compress_dynamic(dtype, &raw_bytes); + println!( + "\nwarmup for {}: compressed to {} bytes", + dataset, + compressed.len(), + ); + + // write to disk + let output_dir = format!("{}/{}", BASE_DIR, self.name()); + let output_path = format!("{}/{}.{}", output_dir, fname, self.name()); + + match fs::create_dir(&output_dir) { + Ok(()) => (), + Err(e) => match e.kind() { + ErrorKind::AlreadyExists => (), + _ => panic!("{}", e), + }, + } + fs::write(output_path, &compressed).expect("couldn't write"); + + // decompress + let rec_raw_bytes = self.decompress_dynamic(dtype, &compressed); + + // TODO make this more informative + if !opt.no_assertions { + assert_eq!(rec_raw_bytes, raw_bytes); + } + + Precomputed { + raw_bytes, + compressed, + } + } + + fn stats_iter(&self, dataset: &str, precomputed: &Precomputed, opt: &HandlerOpt) -> BenchStat { + let dtype = dtype_str(&dataset); + + // compress + let compress_dt = if !opt.no_compress { + let t = Instant::now(); + let _ = self.compress_dynamic( + dtype, + &precomputed.raw_bytes, + ); + let dt = Instant::now() - t; + 
println!("\tcompressed in {:?}", dt); + dt + } else { + Duration::ZERO + }; + + // decompress + let decompress_dt = if !opt.no_decompress { + let t = Instant::now(); + let _ = self.decompress_dynamic(dtype, &precomputed.compressed); + let dt = Instant::now() - t; + println!("\tdecompressed in {:?}", dt); + dt + } else { + Duration::ZERO + }; + + BenchStat { + compressed_size: precomputed.compressed.len(), + compress_dt, + decompress_dt, + } + } + + fn clone_to_box(&self) -> Box { + Box::new(self.clone()) + } +} + +#[derive(Debug)] +pub struct CodecConfig { + pub inner: Box, + pub confs: Vec, +} + +impl FromStr for CodecConfig { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let parts = s.split(':').collect::>(); + let name = parts[0]; + let mut confs = Vec::new(); + for &part in &parts[1..] { + let kv_vec = part.split('=').collect::>(); + if kv_vec.len() != 2 { + return Err(anyhow!("codec config {} is not a key=value pair", part)); + } + confs.push((kv_vec[0].to_string(), kv_vec[1].to_string())); + } + + let mut codec: Box = match name { + "p" | "pco" | "pcodec" => Box::new(PcoConfig::default()), + // "q" | "qco" | "q_compress" => Box::new(QcoConfig::default()), + // "zstd" => Box::new(ZstdConfig::default()), + _ => return Err(anyhow!("unknown codec: {}", name)), + }; + + for (k, v) in &confs { + codec.set_conf(k, v.to_string())?; + } + let mut confs = confs.into_iter().map(|(k, v)| k).collect::>(); + confs.sort_unstable(); + + Ok(Self { + inner: codec, + confs, + }) + } +} + +impl Display for CodecConfig { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}", self.inner.name(), self.inner.details(&self.confs)) + } +} + +impl Clone for CodecConfig { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone_to_box(), + confs: self.confs.clone(), + } + } +} + +impl CodecConfig { + pub fn details(&self) -> String { + self.inner.details(&self.confs) + } +} \ No newline at end of file diff --git 
a/bench/src/codecs/pco.rs b/bench/src/codecs/pco.rs new file mode 100644 index 00000000..dd2ddab4 --- /dev/null +++ b/bench/src/codecs/pco.rs @@ -0,0 +1,66 @@ +use anyhow::{anyhow, Result}; +use crate::codecs::CodecInternal; +use crate::NumberLike; + +#[derive(Clone, Debug, Default)] +pub struct PcoConfig { + use_fixed_delta: bool, + compressor_config: pco::CompressorConfig, +} + +impl CodecInternal for PcoConfig { + fn name(&self) -> &'static str { + "pco" + } + + fn get_conf(&self, key: &str) -> String { + match key { + "level" => self.compressor_config.compression_level.to_string(), + "delta_order" => { + if self.use_fixed_delta { + self.compressor_config.delta_encoding_order.to_string() + } else { + "auto".to_string() + } + }, + "use_gcds" => self.compressor_config.use_gcds.to_string(), + "use_float_mult" => self.compressor_config.use_float_mult.to_string(), + _ => panic!("bad conf"), + } + } + + fn set_conf(&mut self, key: &str, value: String) -> Result<()> { + Ok(match key { + "level" => self.compressor_config.compression_level = value.parse::().unwrap(), + "delta_order" => { + if let Ok(order) = value.parse::() { + self.compressor_config.delta_encoding_order = order; + self.use_fixed_delta = true; + } else if value.to_lowercase() != "auto" { + return Err(anyhow!("cannot parse delta order: {}", value)) + } + }, + "use_gcds" => self.compressor_config.use_gcds = value.parse::().unwrap(), + "use_float_mult" => self.compressor_config.use_float_mult = value.parse::().unwrap(), + _ => { + return Err(anyhow!("unknown conf: {}", key)) + }, + }) + } + + fn compress(&self, nums: &[T]) -> Vec { + let mut c_config = self.compressor_config.clone(); + let pco_nums = T::slice_to_pco(nums); + if !self.use_fixed_delta { + c_config.delta_encoding_order = + pco::auto_compressor_config(pco_nums, c_config.compression_level).delta_encoding_order; + } + pco::standalone::simple_compress(c_config, pco_nums) + } + + fn decompress(&self, bytes: &[u8]) -> Vec { + let v = 
pco::standalone::auto_decompress::(bytes).expect("could not decompress"); + T::vec_from_pco(v) + } +} + From 613c796486395d4eed7e787e8f898407f92a6750 Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 12:52:08 -0400 Subject: [PATCH 05/18] refactor working+clippy --- bench/src/codecs/mod.rs | 9 +++++---- bench/src/codecs/pco.rs | 5 +++-- bench/src/main.rs | 28 ++++++---------------------- bench/src/opt.rs | 4 +--- 4 files changed, 15 insertions(+), 31 deletions(-) diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index 160aa7c7..50aa267b 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -9,7 +9,7 @@ use std::time::{Duration, Instant}; use anyhow::{anyhow, Result}; use crate::codecs::pco::PcoConfig; -use crate::{BASE_DIR, BenchStat, cast_to_nums, NumberLike, Precomputed}; +use crate::{BASE_DIR, BenchStat, NumberLike, Precomputed}; use crate::opt::HandlerOpt; // Unfortunately we can't make a Box because it has generic @@ -35,6 +35,7 @@ trait CodecInternal: Clone + Debug + Send + Sync + Default + 'static { } } + #[allow(clippy::unsound_collection_transmute)] fn decompress_dynamic(&self, dtype: &str, compressed: &[u8]) -> Vec { unsafe { match dtype { @@ -144,7 +145,7 @@ impl CodecSurface for C { } fn stats_iter(&self, dataset: &str, precomputed: &Precomputed, opt: &HandlerOpt) -> BenchStat { - let dtype = dtype_str(&dataset); + let dtype = dtype_str(dataset); // compress let compress_dt = if !opt.no_compress { @@ -205,7 +206,7 @@ impl FromStr for CodecConfig { } let mut codec: Box = match name { - "p" | "pco" | "pcodec" => Box::new(PcoConfig::default()), + "p" | "pco" | "pcodec" => Box::::default(), // "q" | "qco" | "q_compress" => Box::new(QcoConfig::default()), // "zstd" => Box::new(ZstdConfig::default()), _ => return Err(anyhow!("unknown codec: {}", name)), @@ -214,7 +215,7 @@ impl FromStr for CodecConfig { for (k, v) in &confs { codec.set_conf(k, v.to_string())?; } - let mut confs = confs.into_iter().map(|(k, v)| 
k).collect::>(); + let mut confs = confs.into_iter().map(|(k, _v)| k).collect::>(); confs.sort_unstable(); Ok(Self { diff --git a/bench/src/codecs/pco.rs b/bench/src/codecs/pco.rs index dd2ddab4..50eded07 100644 --- a/bench/src/codecs/pco.rs +++ b/bench/src/codecs/pco.rs @@ -30,7 +30,7 @@ impl CodecInternal for PcoConfig { } fn set_conf(&mut self, key: &str, value: String) -> Result<()> { - Ok(match key { + match key { "level" => self.compressor_config.compression_level = value.parse::().unwrap(), "delta_order" => { if let Ok(order) = value.parse::() { @@ -45,7 +45,8 @@ impl CodecInternal for PcoConfig { _ => { return Err(anyhow!("unknown conf: {}", key)) }, - }) + } + Ok(()) } fn compress(&self, nums: &[T]) -> Vec { diff --git a/bench/src/main.rs b/bench/src/main.rs index d4fbf5d9..e1087023 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -4,10 +4,10 @@ mod opt; mod codecs; use std::fs; -use std::io::ErrorKind; -use std::ops::{AddAssign, Div}; + + use std::path::Path; -use std::time::{Duration, Instant}; +use std::time::{Duration}; use clap::Parser; use tabled::settings::object::Columns; @@ -18,7 +18,7 @@ use opt::Opt; use pco::data_types::NumberLike as PNumberLike; use q_compress::data_types::{NumberLike as QNumberLike, TimestampMicros}; use crate::codecs::CodecConfig; -use crate::opt::AUTO_DELTA; + const BASE_DIR: &str = "bench/data"; // if this delta order is specified, use a dataset-specific order @@ -52,7 +52,7 @@ impl_pco_number_like!(f64, f64); impl_pco_number_like!(TimestampMicros, i64); #[derive(Clone, Default)] -struct BenchStat { +pub struct BenchStat { pub compress_dt: Duration, pub decompress_dt: Duration, pub compressed_size: usize, @@ -108,27 +108,11 @@ fn basename_no_ext(path: &Path) -> String { } } -struct Precomputed { +pub struct Precomputed { raw_bytes: Vec, compressed: Vec, } -fn cast_to_nums(bytes: &[u8]) -> &[T] { - // Here we're assuming the bytes are in the right format for our data type. 
- // For instance, chunks of 8 little-endian bytes on most platforms for - // i64's. - // This is fast and should work across platforms. - unsafe { std::mem::transmute(bytes) } -} - -fn cast_to_bytes(nums: &[T]) -> &[u8] { - // Here we're assuming the bytes are in the right format for our data type. - // For instance, chunks of 8 little-endian bytes on most platforms for - // i64's. - // This is fast and should work across platforms. - unsafe { std::mem::transmute(nums) } -} - // fn compress_pco(nums: &[T], config: pco::CompressorConfig) -> Vec { // pco::standalone::simple_compress(config, nums) // } diff --git a/bench/src/opt.rs b/bench/src/opt.rs index 11d0810c..45bbd703 100644 --- a/bench/src/opt.rs +++ b/bench/src/opt.rs @@ -1,10 +1,8 @@ use std::str::FromStr; -use anyhow::{anyhow, Result}; use clap::{Args, Parser}; -use crate::codecs::CodecConfig; -pub const AUTO_DELTA: usize = usize::MAX; +use crate::codecs::CodecConfig; #[derive(Parser)] pub struct Opt { From 70f1abb294b364819ec9d373ae55825910612f60 Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 12:52:17 -0400 Subject: [PATCH 06/18] fmt --- bench/src/codecs/mod.rs | 48 ++++++++++++++++------------------------- bench/src/codecs/pco.rs | 16 +++++++------- bench/src/main.rs | 36 ++++++++++++++++++++----------- bench/src/opt.rs | 2 +- 4 files changed, 52 insertions(+), 50 deletions(-) diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index 50aa267b..7e400b1d 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -1,16 +1,16 @@ mod pco; use std::fmt::{Debug, Display, Formatter}; -use std::{fs, mem}; use std::io::ErrorKind; use std::path::Path; use std::str::FromStr; use std::time::{Duration, Instant}; +use std::{fs, mem}; -use anyhow::{anyhow, Result}; use crate::codecs::pco::PcoConfig; -use crate::{BASE_DIR, BenchStat, NumberLike, Precomputed}; use crate::opt::HandlerOpt; +use crate::{BenchStat, NumberLike, Precomputed, BASE_DIR}; +use anyhow::{anyhow, Result}; // 
Unfortunately we can't make a Box because it has generic // functions, so we use a wrapping trait (CodecSurface) to manually dynamic @@ -53,20 +53,9 @@ pub trait CodecSurface: Debug + Send + Sync { fn set_conf(&mut self, key: &str, value: String) -> Result<()>; fn details(&self, confs: &[String]) -> String; - fn warmup_iter( - &self, - path: &Path, - dataset: &str, - fname: &str, - opt: &HandlerOpt, - ) -> Precomputed; - - fn stats_iter( - &self, - dataset: &str, - precomputed: &Precomputed, - opt: &HandlerOpt, - ) -> BenchStat; + fn warmup_iter(&self, path: &Path, dataset: &str, fname: &str, opt: &HandlerOpt) -> Precomputed; + + fn stats_iter(&self, dataset: &str, precomputed: &Precomputed, opt: &HandlerOpt) -> BenchStat; fn clone_to_box(&self) -> Box; } @@ -94,11 +83,7 @@ impl CodecSurface for C { for k in confs { let v = self.get_conf(k); if v != default.get_conf(k) { - res.push_str(&format!( - ":{}={}", - k, - v, - )); + res.push_str(&format!(":{}={}", k, v,)); } } res @@ -150,10 +135,7 @@ impl CodecSurface for C { // compress let compress_dt = if !opt.no_compress { let t = Instant::now(); - let _ = self.compress_dynamic( - dtype, - &precomputed.raw_bytes, - ); + let _ = self.compress_dynamic(dtype, &precomputed.raw_bytes); let dt = Instant::now() - t; println!("\tcompressed in {:?}", dt); dt @@ -200,7 +182,10 @@ impl FromStr for CodecConfig { for &part in &parts[1..] 
{ let kv_vec = part.split('=').collect::>(); if kv_vec.len() != 2 { - return Err(anyhow!("codec config {} is not a key=value pair", part)); + return Err(anyhow!( + "codec config {} is not a key=value pair", + part + )); } confs.push((kv_vec[0].to_string(), kv_vec[1].to_string())); } @@ -227,7 +212,12 @@ impl FromStr for CodecConfig { impl Display for CodecConfig { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}:{}", self.inner.name(), self.inner.details(&self.confs)) + write!( + f, + "{}:{}", + self.inner.name(), + self.inner.details(&self.confs) + ) } } @@ -244,4 +234,4 @@ impl CodecConfig { pub fn details(&self) -> String { self.inner.details(&self.confs) } -} \ No newline at end of file +} diff --git a/bench/src/codecs/pco.rs b/bench/src/codecs/pco.rs index 50eded07..0e3fe296 100644 --- a/bench/src/codecs/pco.rs +++ b/bench/src/codecs/pco.rs @@ -1,6 +1,6 @@ -use anyhow::{anyhow, Result}; use crate::codecs::CodecInternal; use crate::NumberLike; +use anyhow::{anyhow, Result}; #[derive(Clone, Debug, Default)] pub struct PcoConfig { @@ -22,7 +22,7 @@ impl CodecInternal for PcoConfig { } else { "auto".to_string() } - }, + } "use_gcds" => self.compressor_config.use_gcds.to_string(), "use_float_mult" => self.compressor_config.use_float_mult.to_string(), _ => panic!("bad conf"), @@ -37,14 +37,15 @@ impl CodecInternal for PcoConfig { self.compressor_config.delta_encoding_order = order; self.use_fixed_delta = true; } else if value.to_lowercase() != "auto" { - return Err(anyhow!("cannot parse delta order: {}", value)) + return Err(anyhow!( + "cannot parse delta order: {}", + value + )); } - }, + } "use_gcds" => self.compressor_config.use_gcds = value.parse::().unwrap(), "use_float_mult" => self.compressor_config.use_float_mult = value.parse::().unwrap(), - _ => { - return Err(anyhow!("unknown conf: {}", key)) - }, + _ => return Err(anyhow!("unknown conf: {}", key)), } Ok(()) } @@ -64,4 +65,3 @@ impl CodecInternal for PcoConfig { 
T::vec_from_pco(v) } } - diff --git a/bench/src/main.rs b/bench/src/main.rs index e1087023..bea7966c 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -1,24 +1,22 @@ #![allow(clippy::useless_transmute)] -mod opt; mod codecs; +mod opt; use std::fs; - use std::path::Path; -use std::time::{Duration}; +use std::time::Duration; use clap::Parser; use tabled::settings::object::Columns; use tabled::settings::{Alignment, Modify, Style}; use tabled::{Table, Tabled}; +use crate::codecs::CodecConfig; use opt::Opt; use pco::data_types::NumberLike as PNumberLike; use q_compress::data_types::{NumberLike as QNumberLike, TimestampMicros}; -use crate::codecs::CodecConfig; - const BASE_DIR: &str = "bench/data"; // if this delta order is specified, use a dataset-specific order @@ -83,8 +81,14 @@ struct PrintStat { impl PrintStat { fn compute(dataset: String, codec: String, benches: &[BenchStat]) -> Self { let compressed_size = benches[0].compressed_size; - let compress_dts = benches.iter().map(|bench| bench.compress_dt).collect::>(); - let decompress_dts = benches.iter().map(|bench| bench.decompress_dt).collect::>(); + let compress_dts = benches + .iter() + .map(|bench| bench.compress_dt) + .collect::>(); + let decompress_dts = benches + .iter() + .map(|bench| bench.decompress_dt) + .collect::>(); PrintStat { dataset, @@ -187,10 +191,16 @@ fn handle(path: &Path, config: &CodecConfig, opt: &Opt) -> PrintS let mut fname = dataset.to_string(); fname.push('_'); fname.push_str(&config.details()); - let precomputed = config.inner.warmup_iter(path, &dataset, &fname, &opt.handler_opt); + let precomputed = config + .inner + .warmup_iter(path, &dataset, &fname, &opt.handler_opt); let mut benches = Vec::with_capacity(opt.iters); for _ in 0..opt.iters { - benches.push(config.inner.stats_iter(&dataset, &precomputed, &opt.handler_opt)); + benches.push( + config + .inner + .stats_iter(&dataset, &precomputed, &opt.handler_opt), + ); } PrintStat::compute(dataset, config.to_string(), &benches) 
} @@ -223,9 +233,11 @@ fn main() { let mut stats = Vec::new(); for path in paths { let path_str = path.to_str().unwrap(); - let keep = opt.datasets.is_empty() || opt.datasets - .iter() - .any(|dataset| path_str.contains(dataset)); + let keep = opt.datasets.is_empty() + || opt + .datasets + .iter() + .any(|dataset| path_str.contains(dataset)); if !keep { continue; } diff --git a/bench/src/opt.rs b/bench/src/opt.rs index 45bbd703..bf92bea4 100644 --- a/bench/src/opt.rs +++ b/bench/src/opt.rs @@ -8,7 +8,7 @@ use crate::codecs::CodecConfig; pub struct Opt { #[arg(long, short, default_value = "pco", value_parser=CodecConfig::from_str, value_delimiter=',')] pub codecs: Vec, - #[arg(long, short, default_value = "", value_delimiter=',')] + #[arg(long, short, default_value = "", value_delimiter = ',')] pub datasets: Vec, #[arg(long, short, default_value = "10")] pub iters: usize, From 85ebb27e58f27e49d237433f2488de26e6443ffb Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 14:40:22 -0400 Subject: [PATCH 07/18] actually working? 
--- bench/src/codecs/mod.rs | 87 +++++++++++++++++++++++------------------ bench/src/main.rs | 37 ++++++++---------- 2 files changed, 65 insertions(+), 59 deletions(-) diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index 7e400b1d..fd67c809 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -1,16 +1,21 @@ -mod pco; - +use std::{fs, mem}; use std::fmt::{Debug, Display, Formatter}; use std::io::ErrorKind; use std::path::Path; use std::str::FromStr; use std::time::{Duration, Instant}; -use std::{fs, mem}; +use anyhow::{anyhow, Result}; + +use q_compress::data_types::TimestampMicros; + +use crate::{BASE_DIR, BenchStat, dtype_str, NumberLike, Precomputed}; use crate::codecs::pco::PcoConfig; +use crate::num_vec::NumVec; use crate::opt::HandlerOpt; -use crate::{BenchStat, NumberLike, Precomputed, BASE_DIR}; -use anyhow::{anyhow, Result}; + +mod pco; +pub mod utils; // Unfortunately we can't make a Box because it has generic // functions, so we use a wrapping trait (CodecSurface) to manually dynamic @@ -26,22 +31,36 @@ trait CodecInternal: Clone + Debug + Send + Sync + Default + 'static { // sad manual dynamic dispatch, but at least we don't need all combinations // of (dtype x codec) - fn compress_dynamic(&self, dtype: &str, raw_bytes: &[u8]) -> Vec { - unsafe { - match dtype { - "i64" => self.compress::(mem::transmute(raw_bytes)), - other => panic!("unknown dtype: {}", other), - } + fn compress_dynamic(&self, num_vec: &NumVec) -> Vec { + match num_vec { + NumVec::I64(nums) => self.compress(nums), + NumVec::F64(nums) => self.compress(nums), + NumVec::Micros(nums) => self.compress(nums), } } - #[allow(clippy::unsound_collection_transmute)] - fn decompress_dynamic(&self, dtype: &str, compressed: &[u8]) -> Vec { - unsafe { - match dtype { - "i64" => mem::transmute(self.decompress::(compressed)), - other => panic!("unknown dtype: {}", other), - } + fn decompress_dynamic(&self, dtype: &str, compressed: &[u8]) -> NumVec { + match dtype { + 
"i64" => NumVec::I64(self.decompress::(compressed)), + "f64" => NumVec::F64(self.decompress::(compressed)), + "micros" => NumVec::Micros(self.decompress::(compressed)), + _ => panic!("unknown dtype {}", dtype), + } + } + + fn compare_nums(&self, recovered: &[T], original: &[T]) { + assert_eq!(recovered.len(), original.len()); + for (i, (x, y)) in recovered.iter().zip(original.iter()).enumerate() { + assert_eq!(x, y, "at {}", i); + } + } + + fn compare_nums_dynamic(&self, recovered: &NumVec, original: &NumVec) { + match (recovered, original) { + (NumVec::I64(x), NumVec::I64(y)) => self.compare_nums(x, y), + (NumVec::F64(x), NumVec::F64(y)) => self.compare_nums(x, y), + (NumVec::Micros(x), NumVec::Micros(y)) => self.compare_nums(x, y), + _ => panic!("should be unreachable"), } } } @@ -53,17 +72,12 @@ pub trait CodecSurface: Debug + Send + Sync { fn set_conf(&mut self, key: &str, value: String) -> Result<()>; fn details(&self, confs: &[String]) -> String; - fn warmup_iter(&self, path: &Path, dataset: &str, fname: &str, opt: &HandlerOpt) -> Precomputed; - - fn stats_iter(&self, dataset: &str, precomputed: &Precomputed, opt: &HandlerOpt) -> BenchStat; + fn warmup_iter(&self, num_vec: &NumVec, dataset: &str, fname: &str, opt: &HandlerOpt) -> Precomputed; + fn stats_iter(&self, nums: &NumVec, precomputed: &Precomputed, opt: &HandlerOpt) -> BenchStat; fn clone_to_box(&self) -> Box; } -fn dtype_str(dataset: &str) -> &str { - dataset.split('_').next().unwrap() -} - impl CodecSurface for C { fn name(&self) -> &'static str { self.name() @@ -89,13 +103,11 @@ impl CodecSurface for C { res } - fn warmup_iter(&self, path: &Path, dataset: &str, fname: &str, opt: &HandlerOpt) -> Precomputed { - // read in data - let raw_bytes = fs::read(path).expect("could not read"); + fn warmup_iter(&self, nums: &NumVec, dataset: &str, fname: &str, opt: &HandlerOpt) -> Precomputed { + let dtype = dtype_str(&dataset); // compress - let dtype = dtype_str(dataset); - let compressed = 
self.compress_dynamic(dtype, &raw_bytes); + let compressed = self.compress_dynamic(nums); println!( "\nwarmup for {}: compressed to {} bytes", dataset, @@ -116,26 +128,23 @@ impl CodecSurface for C { fs::write(output_path, &compressed).expect("couldn't write"); // decompress - let rec_raw_bytes = self.decompress_dynamic(dtype, &compressed); + let rec_nums = self.decompress_dynamic(dtype, &compressed); - // TODO make this more informative if !opt.no_assertions { - assert_eq!(rec_raw_bytes, raw_bytes); + self.compare_nums_dynamic(&rec_nums, &nums); } Precomputed { - raw_bytes, compressed, + dtype: dtype.to_string(), } } - fn stats_iter(&self, dataset: &str, precomputed: &Precomputed, opt: &HandlerOpt) -> BenchStat { - let dtype = dtype_str(dataset); - + fn stats_iter(&self, nums: &NumVec, precomputed: &Precomputed, opt: &HandlerOpt) -> BenchStat { // compress let compress_dt = if !opt.no_compress { let t = Instant::now(); - let _ = self.compress_dynamic(dtype, &precomputed.raw_bytes); + let _ = self.compress_dynamic(nums); let dt = Instant::now() - t; println!("\tcompressed in {:?}", dt); dt @@ -146,7 +155,7 @@ impl CodecSurface for C { // decompress let decompress_dt = if !opt.no_decompress { let t = Instant::now(); - let _ = self.decompress_dynamic(dtype, &precomputed.compressed); + let _ = self.decompress_dynamic(&precomputed.dtype, &precomputed.compressed); let dt = Instant::now() - t; println!("\tdecompressed in {:?}", dt); dt diff --git a/bench/src/main.rs b/bench/src/main.rs index bea7966c..3245a4ba 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -2,6 +2,7 @@ mod codecs; mod opt; +pub mod num_vec; use std::fs; @@ -17,11 +18,17 @@ use crate::codecs::CodecConfig; use opt::Opt; use pco::data_types::NumberLike as PNumberLike; use q_compress::data_types::{NumberLike as QNumberLike, TimestampMicros}; +use crate::num_vec::NumVec; const BASE_DIR: &str = "bench/data"; // if this delta order is specified, use a dataset-specific order -trait NumberLike: 
QNumberLike { +fn dtype_str(dataset: &str) -> &str { + dataset.split('_').next().unwrap() +} + + +pub trait NumberLike: QNumberLike { type Pco: PNumberLike; fn slice_to_pco(slice: &[Self]) -> &[Self::Pco]; @@ -113,8 +120,8 @@ fn basename_no_ext(path: &Path) -> String { } pub struct Precomputed { - raw_bytes: Vec, compressed: Vec, + dtype: String, } // fn compress_pco(nums: &[T], config: pco::CompressorConfig) -> Vec { @@ -185,21 +192,25 @@ pub struct Precomputed { // (Instant::now() - t, rec_nums) // } -fn handle(path: &Path, config: &CodecConfig, opt: &Opt) -> PrintStat { +fn handle(path: &Path, config: &CodecConfig, opt: &Opt) -> PrintStat { let dataset = basename_no_ext(path); + let dtype = dtype_str(&dataset); + let mut fname = dataset.to_string(); fname.push('_'); fname.push_str(&config.details()); + let raw_bytes = fs::read(path).expect("could not read"); + let num_vec = NumVec::new(dtype, raw_bytes); let precomputed = config .inner - .warmup_iter(path, &dataset, &fname, &opt.handler_opt); + .warmup_iter(&num_vec, &dataset, &fname, &opt.handler_opt); let mut benches = Vec::with_capacity(opt.iters); for _ in 0..opt.iters { benches.push( config .inner - .stats_iter(&dataset, &precomputed, &opt.handler_opt), + .stats_iter(&num_vec, &precomputed, &opt.handler_opt), ); } PrintStat::compute(dataset, config.to_string(), &benches) @@ -243,21 +254,7 @@ fn main() { } for config in &opt.codecs { - let stat = if path_str.contains("i64") || path_str.contains("micros") { - handle::(&path, config, &opt) - } else if path_str.contains("f64") { - handle::(&path, config, &opt) - } else if path_str.contains("f32") { - handle::(&path, config, &opt) - } else if path_str.contains("micros") { - handle::(&path, config, &opt) - } else { - panic!( - "Could not determine dtype for file {}!", - path_str - ); - }; - stats.push(stat); + stats.push(handle(&path, config, &opt)); } } From af9c041011e824efd16f6ac8bfb8dd5852e416f5 Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 
14:40:37 -0400 Subject: [PATCH 08/18] new files --- bench/src/codecs/utils.rs | 22 ++++++++++++++++++++++ bench/src/num_vec.rs | 30 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 bench/src/codecs/utils.rs create mode 100644 bench/src/num_vec.rs diff --git a/bench/src/codecs/utils.rs b/bench/src/codecs/utils.rs new file mode 100644 index 00000000..2ada3f6a --- /dev/null +++ b/bench/src/codecs/utils.rs @@ -0,0 +1,22 @@ +// use std::mem; +// use std::ptr::slice_from_raw_parts; +// use crate::NumberLike; +// +// pub fn byte_slice_to_nums(bytes: &[u8]) -> &[T] { +// let bytes_per_num = T::PHYSICAL_BITS / 8; +// unsafe { +// mem::transmute(slice_from_raw_parts(mem::transmute::<_, *const T>(bytes.as_ptr()), bytes.len() / bytes_per_num)) +// } +// } +// +// pub fn num_vec_to_bytes(nums: Vec) -> Vec { +// let bytes_per_num = T::PHYSICAL_BITS / 8; +// let byte_len = nums.len() * bytes_per_num; +// unsafe { +// Vec::from_raw_parts( +// mem::transmute::<_, *mut u8>(nums.as_ptr()), +// byte_len, +// byte_len, +// ) +// } +// } \ No newline at end of file diff --git a/bench/src/num_vec.rs b/bench/src/num_vec.rs new file mode 100644 index 00000000..d240d088 --- /dev/null +++ b/bench/src/num_vec.rs @@ -0,0 +1,30 @@ +use std::mem::transmute_copy; +use q_compress::data_types::TimestampMicros; +use crate::NumberLike; + +pub enum NumVec { + I64(Vec), + F64(Vec), + Micros(Vec), +} + +// very cursed! 
+fn byte_vec_to_nums(raw_bytes: Vec) -> Vec { + let bytes_per_num = T::PHYSICAL_BITS / 8; + unsafe { + let mut v: Vec = transmute_copy(&raw_bytes); + v.set_len(raw_bytes.len() / bytes_per_num); + v + } +} + +impl NumVec { + pub fn new(dtype: &str, raw_bytes: Vec) -> Self { + match dtype { + "i64" => NumVec::I64(byte_vec_to_nums(raw_bytes)), + "f64" => NumVec::F64(byte_vec_to_nums(raw_bytes)), + "micros" => NumVec::Micros(byte_vec_to_nums(raw_bytes)), + _ => panic!("unknown dtype {}", dtype), + } + } +} From c85a43ce9b65c4ae86545fad98bd159f5be72b65 Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 14:45:58 -0400 Subject: [PATCH 09/18] fmt --- bench/src/codecs/mod.rs | 22 +++++++++++++++++----- bench/src/codecs/utils.rs | 2 +- bench/src/main.rs | 6 ++---- bench/src/num_vec.rs | 12 +++++------- pco_cli/src/opt.rs | 2 +- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index fd67c809..9737157e 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -1,7 +1,7 @@ -use std::{fs, mem}; use std::fmt::{Debug, Display, Formatter}; +use std::fs; use std::io::ErrorKind; -use std::path::Path; + use std::str::FromStr; use std::time::{Duration, Instant}; @@ -9,10 +9,10 @@ use anyhow::{anyhow, Result}; use q_compress::data_types::TimestampMicros; -use crate::{BASE_DIR, BenchStat, dtype_str, NumberLike, Precomputed}; use crate::codecs::pco::PcoConfig; use crate::num_vec::NumVec; use crate::opt::HandlerOpt; +use crate::{dtype_str, BenchStat, NumberLike, Precomputed, BASE_DIR}; mod pco; pub mod utils; @@ -72,7 +72,13 @@ pub trait CodecSurface: Debug + Send + Sync { fn set_conf(&mut self, key: &str, value: String) -> Result<()>; fn details(&self, confs: &[String]) -> String; - fn warmup_iter(&self, num_vec: &NumVec, dataset: &str, fname: &str, opt: &HandlerOpt) -> Precomputed; + fn warmup_iter( + &self, + num_vec: &NumVec, + dataset: &str, + fname: &str, + opt: &HandlerOpt, + ) -> Precomputed; fn 
stats_iter(&self, nums: &NumVec, precomputed: &Precomputed, opt: &HandlerOpt) -> BenchStat; fn clone_to_box(&self) -> Box; @@ -103,7 +109,13 @@ impl CodecSurface for C { res } - fn warmup_iter(&self, nums: &NumVec, dataset: &str, fname: &str, opt: &HandlerOpt) -> Precomputed { + fn warmup_iter( + &self, + nums: &NumVec, + dataset: &str, + fname: &str, + opt: &HandlerOpt, + ) -> Precomputed { let dtype = dtype_str(&dataset); // compress diff --git a/bench/src/codecs/utils.rs b/bench/src/codecs/utils.rs index 2ada3f6a..3511432c 100644 --- a/bench/src/codecs/utils.rs +++ b/bench/src/codecs/utils.rs @@ -19,4 +19,4 @@ // byte_len, // ) // } -// } \ No newline at end of file +// } diff --git a/bench/src/main.rs b/bench/src/main.rs index 3245a4ba..aa337bae 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -1,8 +1,8 @@ #![allow(clippy::useless_transmute)] mod codecs; -mod opt; pub mod num_vec; +mod opt; use std::fs; @@ -15,10 +15,10 @@ use tabled::settings::{Alignment, Modify, Style}; use tabled::{Table, Tabled}; use crate::codecs::CodecConfig; +use crate::num_vec::NumVec; use opt::Opt; use pco::data_types::NumberLike as PNumberLike; use q_compress::data_types::{NumberLike as QNumberLike, TimestampMicros}; -use crate::num_vec::NumVec; const BASE_DIR: &str = "bench/data"; // if this delta order is specified, use a dataset-specific order @@ -27,7 +27,6 @@ fn dtype_str(dataset: &str) -> &str { dataset.split('_').next().unwrap() } - pub trait NumberLike: QNumberLike { type Pco: PNumberLike; @@ -196,7 +195,6 @@ fn handle(path: &Path, config: &CodecConfig, opt: &Opt) -> PrintStat { let dataset = basename_no_ext(path); let dtype = dtype_str(&dataset); - let mut fname = dataset.to_string(); fname.push('_'); fname.push_str(&config.details()); diff --git a/bench/src/num_vec.rs b/bench/src/num_vec.rs index d240d088..d9b0a77e 100644 --- a/bench/src/num_vec.rs +++ b/bench/src/num_vec.rs @@ -1,6 +1,5 @@ -use std::mem::transmute_copy; -use q_compress::data_types::TimestampMicros; 
use crate::NumberLike; +use q_compress::data_types::TimestampMicros; pub enum NumVec { I64(Vec), @@ -11,11 +10,10 @@ pub enum NumVec { // very cursed! fn byte_vec_to_nums(raw_bytes: Vec) -> Vec { let bytes_per_num = T::PHYSICAL_BITS / 8; - unsafe { - let mut v: Vec = transmute_copy(&raw_bytes); - v.set_len(raw_bytes.len() / bytes_per_num); - v - } + raw_bytes + .chunks_exact(bytes_per_num) + .map(|chunk| T::from_bytes(chunk).unwrap()) + .collect::>() } impl NumVec { diff --git a/pco_cli/src/opt.rs b/pco_cli/src/opt.rs index eade8a36..48f813c4 100644 --- a/pco_cli/src/opt.rs +++ b/pco_cli/src/opt.rs @@ -1,8 +1,8 @@ use std::path::PathBuf; -use clap::{Parser, Subcommand}; use anyhow::anyhow; use anyhow::Result; +use clap::{Parser, Subcommand}; use crate::dtype::DType; From 6b0c328ec93cec4059098e5cd7c08cd0c794754f Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 19:00:46 -0400 Subject: [PATCH 10/18] add other codecs back in --- bench/src/codecs/mod.rs | 12 +++++--- bench/src/codecs/qco.rs | 63 +++++++++++++++++++++++++++++++++++++++ bench/src/codecs/utils.rs | 37 ++++++++++------------- bench/src/codecs/zstd.rs | 52 ++++++++++++++++++++++++++++++++ bench/src/main.rs | 35 ++-------------------- bench/src/num_vec.rs | 1 - 6 files changed, 140 insertions(+), 60 deletions(-) create mode 100644 bench/src/codecs/qco.rs create mode 100644 bench/src/codecs/zstd.rs diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index 9737157e..0f8ca282 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -13,8 +13,12 @@ use crate::codecs::pco::PcoConfig; use crate::num_vec::NumVec; use crate::opt::HandlerOpt; use crate::{dtype_str, BenchStat, NumberLike, Precomputed, BASE_DIR}; +use crate::codecs::qco::QcoConfig; +use crate::codecs::zstd::ZstdConfig; mod pco; +mod qco; +mod zstd; pub mod utils; // Unfortunately we can't make a Box because it has generic @@ -51,7 +55,7 @@ trait CodecInternal: Clone + Debug + Send + Sync + Default + 'static { fn 
compare_nums(&self, recovered: &[T], original: &[T]) { assert_eq!(recovered.len(), original.len()); for (i, (x, y)) in recovered.iter().zip(original.iter()).enumerate() { - assert_eq!(x, y, "at {}", i); + assert_eq!(x.to_unsigned(), y.to_unsigned(), "{} != {} at {}", x, y, i); } } @@ -213,8 +217,8 @@ impl FromStr for CodecConfig { let mut codec: Box = match name { "p" | "pco" | "pcodec" => Box::::default(), - // "q" | "qco" | "q_compress" => Box::new(QcoConfig::default()), - // "zstd" => Box::new(ZstdConfig::default()), + "q" | "qco" | "q_compress" => Box::::default(), + "zstd" => Box::::default(), _ => return Err(anyhow!("unknown codec: {}", name)), }; @@ -235,7 +239,7 @@ impl Display for CodecConfig { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, - "{}:{}", + "{}{}", self.inner.name(), self.inner.details(&self.confs) ) diff --git a/bench/src/codecs/qco.rs b/bench/src/codecs/qco.rs new file mode 100644 index 00000000..43dfea57 --- /dev/null +++ b/bench/src/codecs/qco.rs @@ -0,0 +1,63 @@ +use crate::codecs::CodecInternal; +use crate::NumberLike; +use anyhow::{anyhow, Result}; + +#[derive(Clone, Debug, Default)] +pub struct QcoConfig { + use_fixed_delta: bool, + compressor_config: q_compress::CompressorConfig, +} + +impl CodecInternal for QcoConfig { + fn name(&self) -> &'static str { + "qco" + } + + fn get_conf(&self, key: &str) -> String { + match key { + "level" => self.compressor_config.compression_level.to_string(), + "delta_order" => { + if self.use_fixed_delta { + self.compressor_config.delta_encoding_order.to_string() + } else { + "auto".to_string() + } + } + "use_gcds" => self.compressor_config.use_gcds.to_string(), + _ => panic!("bad conf"), + } + } + + fn set_conf(&mut self, key: &str, value: String) -> Result<()> { + match key { + "level" => self.compressor_config.compression_level = value.parse::().unwrap(), + "delta_order" => { + if let Ok(order) = value.parse::() { + self.compressor_config.delta_encoding_order = order; + 
self.use_fixed_delta = true; + } else if value.to_lowercase() != "auto" { + return Err(anyhow!( + "cannot parse delta order: {}", + value + )); + } + } + "use_gcds" => self.compressor_config.use_gcds = value.parse::().unwrap(), + _ => return Err(anyhow!("unknown conf: {}", key)), + } + Ok(()) + } + + fn compress(&self, nums: &[T]) -> Vec { + let mut c_config = self.compressor_config.clone(); + if !self.use_fixed_delta { + c_config.delta_encoding_order = + q_compress::auto_compressor_config(nums, c_config.compression_level).delta_encoding_order; + } + q_compress::standalone::Compressor::::from_config(c_config).simple_compress(nums) + } + + fn decompress(&self, bytes: &[u8]) -> Vec { + q_compress::auto_decompress::(bytes).expect("could not decompress") + } +} diff --git a/bench/src/codecs/utils.rs b/bench/src/codecs/utils.rs index 3511432c..0225231a 100644 --- a/bench/src/codecs/utils.rs +++ b/bench/src/codecs/utils.rs @@ -1,22 +1,15 @@ -// use std::mem; -// use std::ptr::slice_from_raw_parts; -// use crate::NumberLike; -// -// pub fn byte_slice_to_nums(bytes: &[u8]) -> &[T] { -// let bytes_per_num = T::PHYSICAL_BITS / 8; -// unsafe { -// mem::transmute(slice_from_raw_parts(mem::transmute::<_, *const T>(bytes.as_ptr()), bytes.len() / bytes_per_num)) -// } -// } -// -// pub fn num_vec_to_bytes(nums: Vec) -> Vec { -// let bytes_per_num = T::PHYSICAL_BITS / 8; -// let byte_len = nums.len() * bytes_per_num; -// unsafe { -// Vec::from_raw_parts( -// mem::transmute::<_, *mut u8>(nums.as_ptr()), -// byte_len, -// byte_len, -// ) -// } -// } +use std::mem; +use crate::NumberLike; + +// cursed ways to convert nums to bytes and back without doing work +pub unsafe fn num_slice_to_bytes(slice: &[T]) -> &[u8] { + let len = slice.len(); + let byte_len = len * (T::PHYSICAL_BITS / 8); + &*std::ptr::slice_from_raw_parts(mem::transmute::<_, *const u8>(slice.as_ptr()), byte_len) +} + +pub unsafe fn num_slice_to_bytes_mut(slice: &mut [T]) -> &mut [u8] { + let len = slice.len(); + let 
byte_len = len * (T::PHYSICAL_BITS / 8); + &mut *std::ptr::slice_from_raw_parts_mut(mem::transmute::<_, *mut u8>(slice.as_ptr()), byte_len) +} diff --git a/bench/src/codecs/zstd.rs b/bench/src/codecs/zstd.rs new file mode 100644 index 00000000..d5a41f64 --- /dev/null +++ b/bench/src/codecs/zstd.rs @@ -0,0 +1,52 @@ +use std::convert::TryInto; + +use anyhow::{anyhow, Result}; + +use crate::codecs::{CodecInternal, utils}; +use crate::NumberLike; + +#[derive(Clone, Debug, Default)] +pub struct ZstdConfig { + level: i32, +} + +impl CodecInternal for ZstdConfig { + fn name(&self) -> &'static str { + "zstd" + } + + fn get_conf(&self, key: &str) -> String { + match key { + "level" => self.level.to_string(), + _ => panic!("bad conf"), + } + } + + fn set_conf(&mut self, key: &str, value: String) -> Result<()> { + match key { + "level" => self.level = value.parse::().unwrap(), + _ => return Err(anyhow!("unknown conf: {}", key)), + } + Ok(()) + } + + // we prefix with a u32 of the + fn compress(&self, nums: &[T]) -> Vec { + let mut res = Vec::new(); + res.extend((nums.len() as u32).to_le_bytes()); + unsafe { + zstd::stream::copy_encode(utils::num_slice_to_bytes(nums), &mut res, self.level).unwrap(); + } + res + } + + fn decompress(&self, bytes: &[u8]) -> Vec { + let len = u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as usize; + let mut res = Vec::::with_capacity(len); + unsafe { + res.set_len(len); + zstd::stream::copy_decode(&bytes[4..], utils::num_slice_to_bytes_mut(res.as_mut_slice())).unwrap(); + } + res + } +} diff --git a/bench/src/main.rs b/bench/src/main.rs index aa337bae..12c42709 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -64,8 +64,8 @@ pub struct BenchStat { fn median_duration(mut durations: Vec) -> Duration { durations.sort_unstable(); - let lo = durations[durations.len() / 2]; - let hi = durations[(durations.len() + 1) / 2]; + let lo = durations[(durations.len() - 1) / 2]; + let hi = durations[durations.len() / 2]; (lo + hi) / 2 } @@ -123,18 
+123,6 @@ pub struct Precomputed { dtype: String, } -// fn compress_pco(nums: &[T], config: pco::CompressorConfig) -> Vec { -// pco::standalone::simple_compress(config, nums) -// } -// -// fn compress_qco(nums: &[T], config: q_compress::CompressorConfig) -> Vec { -// q_compress::Compressor::::from_config(config).simple_compress(nums) -// } -// -// fn decompress_qco(bytes: &[u8]) -> Vec { -// q_compress::auto_decompress(bytes).expect("could not decompress") -// } -// // fn compress( // nums: &[T], // config: &CodecConfig, @@ -142,25 +130,6 @@ pub struct Precomputed { // let t = Instant::now(); // let compressed = config.inner.compress(nums); // // let compressed = match &mut qualified_config { -// // CodecConfig::Pco(pco_conf) => { -// // let mut conf = pco_conf.clone(); -// // let pco_nums = T::slice_to_pco(nums); -// // if conf.delta_encoding_order == AUTO_DELTA { -// // conf.delta_encoding_order = -// // pco::auto_compressor_config(pco_nums, conf.compression_level).delta_encoding_order; -// // } -// // *pco_conf = conf.clone(); -// // compress_pco(pco_nums, conf) -// // } -// // CodecConfig::QCompress(qco_conf) => { -// // let mut conf = qco_conf.clone(); -// // if conf.delta_encoding_order == AUTO_DELTA { -// // conf.delta_encoding_order = -// // q_compress::auto_compressor_config(nums, conf.compression_level).delta_encoding_order; -// // } -// // *qco_conf = conf.clone(); -// // compress_qco(nums, conf) -// // } // // CodecConfig::ZStd(level) => { // // let level = *level as i32; // // zstd::encode_all(raw_bytes, level).unwrap() diff --git a/bench/src/num_vec.rs b/bench/src/num_vec.rs index d9b0a77e..c7ebdc1b 100644 --- a/bench/src/num_vec.rs +++ b/bench/src/num_vec.rs @@ -7,7 +7,6 @@ pub enum NumVec { Micros(Vec), } -// very cursed! 
fn byte_vec_to_nums(raw_bytes: Vec) -> Vec { let bytes_per_num = T::PHYSICAL_BITS / 8; raw_bytes From be381dea978dda4bdace4250fe4dfd27a4ef2cff Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 19:01:03 -0400 Subject: [PATCH 11/18] fmt --- bench/src/codecs/mod.rs | 15 +++++++++++---- bench/src/codecs/utils.rs | 12 +++++++++--- bench/src/codecs/zstd.rs | 15 ++++++++++++--- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index 0f8ca282..89b1a74f 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -10,16 +10,16 @@ use anyhow::{anyhow, Result}; use q_compress::data_types::TimestampMicros; use crate::codecs::pco::PcoConfig; +use crate::codecs::qco::QcoConfig; +use crate::codecs::zstd::ZstdConfig; use crate::num_vec::NumVec; use crate::opt::HandlerOpt; use crate::{dtype_str, BenchStat, NumberLike, Precomputed, BASE_DIR}; -use crate::codecs::qco::QcoConfig; -use crate::codecs::zstd::ZstdConfig; mod pco; mod qco; -mod zstd; pub mod utils; +mod zstd; // Unfortunately we can't make a Box because it has generic // functions, so we use a wrapping trait (CodecSurface) to manually dynamic @@ -55,7 +55,14 @@ trait CodecInternal: Clone + Debug + Send + Sync + Default + 'static { fn compare_nums(&self, recovered: &[T], original: &[T]) { assert_eq!(recovered.len(), original.len()); for (i, (x, y)) in recovered.iter().zip(original.iter()).enumerate() { - assert_eq!(x.to_unsigned(), y.to_unsigned(), "{} != {} at {}", x, y, i); + assert_eq!( + x.to_unsigned(), + y.to_unsigned(), + "{} != {} at {}", + x, + y, + i + ); } } diff --git a/bench/src/codecs/utils.rs b/bench/src/codecs/utils.rs index 0225231a..da2c1295 100644 --- a/bench/src/codecs/utils.rs +++ b/bench/src/codecs/utils.rs @@ -1,15 +1,21 @@ -use std::mem; use crate::NumberLike; +use std::mem; // cursed ways to convert nums to bytes and back without doing work pub unsafe fn num_slice_to_bytes(slice: &[T]) -> &[u8] { let len = slice.len(); 
let byte_len = len * (T::PHYSICAL_BITS / 8); - &*std::ptr::slice_from_raw_parts(mem::transmute::<_, *const u8>(slice.as_ptr()), byte_len) + &*std::ptr::slice_from_raw_parts( + mem::transmute::<_, *const u8>(slice.as_ptr()), + byte_len, + ) } pub unsafe fn num_slice_to_bytes_mut(slice: &mut [T]) -> &mut [u8] { let len = slice.len(); let byte_len = len * (T::PHYSICAL_BITS / 8); - &mut *std::ptr::slice_from_raw_parts_mut(mem::transmute::<_, *mut u8>(slice.as_ptr()), byte_len) + &mut *std::ptr::slice_from_raw_parts_mut( + mem::transmute::<_, *mut u8>(slice.as_ptr()), + byte_len, + ) } diff --git a/bench/src/codecs/zstd.rs b/bench/src/codecs/zstd.rs index d5a41f64..e677bbb9 100644 --- a/bench/src/codecs/zstd.rs +++ b/bench/src/codecs/zstd.rs @@ -2,7 +2,7 @@ use std::convert::TryInto; use anyhow::{anyhow, Result}; -use crate::codecs::{CodecInternal, utils}; +use crate::codecs::{utils, CodecInternal}; use crate::NumberLike; #[derive(Clone, Debug, Default)] @@ -35,7 +35,12 @@ impl CodecInternal for ZstdConfig { let mut res = Vec::new(); res.extend((nums.len() as u32).to_le_bytes()); unsafe { - zstd::stream::copy_encode(utils::num_slice_to_bytes(nums), &mut res, self.level).unwrap(); + zstd::stream::copy_encode( + utils::num_slice_to_bytes(nums), + &mut res, + self.level, + ) + .unwrap(); } res } @@ -45,7 +50,11 @@ impl CodecInternal for ZstdConfig { let mut res = Vec::::with_capacity(len); unsafe { res.set_len(len); - zstd::stream::copy_decode(&bytes[4..], utils::num_slice_to_bytes_mut(res.as_mut_slice())).unwrap(); + zstd::stream::copy_decode( + &bytes[4..], + utils::num_slice_to_bytes_mut(res.as_mut_slice()), + ) + .unwrap(); } res } From e1c4edd7ef484da8e9945cfdb164cd26ca48bb4d Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 19:30:15 -0400 Subject: [PATCH 12/18] fix byte reading... 
by resorting to the old hacks --- bench/README.md | 138 +++---------------------------------- bench/src/codecs/mod.rs | 3 +- bench/src/main.rs | 38 ---------- bench/src/num_vec.rs | 23 ++++--- pco/src/base_compressor.rs | 2 +- 5 files changed, 24 insertions(+), 180 deletions(-) diff --git a/bench/README.md b/bench/README.md index dadedd23..f2f5250f 100644 --- a/bench/README.md +++ b/bench/README.md @@ -1,14 +1,11 @@ # Benchmarks -This generates a wide variety of common distributions -with the `bool`, `f64`, `i64`, and `TimestampMicros` data types, +This generates a wide variety of common distributions, compresses them, decompresses them, and makes sure all the data came back bitwise identical. -We also compare vs -gzip, Snappy, ZStandard, and their combinations with Parquet -on the binary data of these numbers. -On all parquet files, we use the max compression level available for the -codec (9 for gzip, 22 for zstd). +It supports +* multiple codecs (pco, q_compress, zstd) +* multiple data types ## Running @@ -33,132 +30,13 @@ For instance, ``` shows floats sampled from a standard normal distribution. -Then to run quantile compression and decompression on each dataset, run +Then to run pco and decompression on each dataset, run `cargo run --release --bin bench`. This will show the compressed size and how long it took to compress and decompress each dataset. You can see the compressed files in `bench/data/pco/`. -You can try different configurations as well as q_compress and -ZStandard on any subset of the -datasets by specifying arguments; e.g. 
the following runs 3 iterations of -* `q_compress` level 12 with delta encoding order 1 and GCD's off -* and `zstd` level 22 - -on any datasets whose names match "near_linear" or "slow_cosine": -``` -cargo run --release --example primary -- \ - -c "qco:12:1:false,zstd:22" \ - -d "slow_cosine,near_linear", \ - -i 3 -``` - -When generating randoms, some comparison file formats were already generated, -like `.zstd.parquet` in `bench/data/zstd_parquet/`. - -To try pure gzip on the same data, -make sure you have `gzip` and `xargs` installed, -then simply run `sh bench/run_gzip.sh`. -This will use gzip to compress the binary version of the data at compression -levels 1 and 9. - -To try pure Snappy, -you can install the `szip` and `xargs` commands and run -`sh bench/run_snappy.sh`. - -## Comparing vs other algorithms - -You can compare file sizes with `ls`: -``` -% ls -lh bench/data/pco | awk '{print $5 "\t" $9}' -1.6M f64_decimal_8:0:true.qco -2.0M f64_diablo_long_8:0:true.qco -6.3K f64_diablo_short_8:0:true.qco -3.6M f64_integers_8:0:true.qco -6.6M f64_normal_at_0_8:0:true.qco -4.2M f64_normal_at_1M_8:0:true.qco -55B f64_radians_8:1:true.qco -1.8M f64_slow_cosine_8:5:true.qco -202K i64_bad_huffman_8:0:true.qco -432K i64_cents_8:0:true.qco -28B i64_constant_8:0:true.qco -2.6M i64_dist_shift_8:0:true.qco -605K i64_dollars_8:0:true.qco -245K i64_geo2_8:0:true.qco -1.2M i64_interl0_8:0:true.qco -2.2M i64_interl1_8:0:true.qco -2.2M i64_interl_scrambl1_8:0:true.qco -1.6M i64_lomax05_long_8:0:true.qco -5.3K i64_lomax05_short_8:0:true.qco -1.5M i64_lomax25_8:0:true.qco -185K i64_slow_cosine_8:2:true.qco -9.9K i64_sparse_8:0:true.qco -1.2M i64_total_cents_8:0:true.qco -7.6M i64_uniform_8:0:true.qco -3.6M micros_millis_8:0:true.qco -2.7M micros_near_linear_8:1:true.qco - -% ls -lh bench/data/zstd_parquet | awk '{print $5 "\t" $9}' -1.7M f64_decimal.zstd.parquet -2.0M f64_diablo_long.zstd.parquet -9.5K f64_diablo_short.zstd.parquet -4.9M f64_integers.zstd.parquet -7.6M 
f64_normal_at_0.zstd.parquet -5.4M f64_normal_at_1M.zstd.parquet -7.0M f64_radians.zstd.parquet -7.5M f64_slow_cosine.zstd.parquet -207K i64_bad_huffman.zstd.parquet -606K i64_cents.zstd.parquet -615B i64_constant.zstd.parquet -3.2M i64_dist_shift.zstd.parquet -888K i64_dollars.zstd.parquet -345K i64_geo2.zstd.parquet -1.3M i64_interl0.zstd.parquet -1.9M i64_interl1.zstd.parquet -2.3M i64_interl_scrambl1.zstd.parquet -2.3M i64_lomax05_long.zstd.parquet -11K i64_lomax05_short.zstd.parquet -1.8M i64_lomax25.zstd.parquet -1.8M i64_slow_cosine.zstd.parquet -17K i64_sparse.zstd.parquet -1.4M i64_total_cents.zstd.parquet -7.9M i64_uniform.zstd.parquet -5.9M micros_millis.zstd.parquet -3.3M micros_near_linear.zstd.parquet -``` - -In the above `ls` commands, -you can see that `.pco` files are typically a good deal smaller -than their corresponding `.zstd.parquet` files, -even though we're comparing a fast `q_compress` compression level with the -very highest zstd compresison level. - -Other than `.pco` and `.qco`, the best performing alternative was `.zstd.parquet`. -Some observations one can draw, comparing `.pco` to `.zstd.parquet`: -* In all cases `.pco` files are smaller. - On average about 27% smaller. -* With uniformly random data, there's not really any information to compress, - so both algorithms use close to the original file size of 7.6MB. -* Particularly interesting are the `cents`, `dollars`, and `total_cents` - distributions, which are meant to model the distribution of prices - at a retail store. - The cents are commonly 99, 98, 0, etc. - Quantile compression smooths over high-frequency information like this - when just given total cents (100 * dollars + cents), and only compresses - down to 1.29MB. - But given the two columns separately, it compresses down to - 620K + 441K = 1.04MB. 
-* However, if you run at the max `q_compress` level of 12 - (`cargo run --release --example primary 12`), - total cents drops to about 1.04MB, whereas dollars and cents separately - stay at 1.02MB. - So some suboptimal choices of data model can be compensated for via - increased compression level. -* Some float distributions can't be compressed much. - That's because between any power of 2, 64 bit floats use 52 bits of - information, which is already most of their 64 bits. - In other words, even a fairly tight distribution of floats can have high - entropy. - Integer distributions have low entropy much more commonly. - +Check `cargo run --release --bin bench -- --help` for information on how to +run other codecs, configure codecs differently, only run specific datasets, +etc. diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index 89b1a74f..251b12e1 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -1,7 +1,6 @@ use std::fmt::{Debug, Display, Formatter}; use std::fs; use std::io::ErrorKind; - use std::str::FromStr; use std::time::{Duration, Instant}; @@ -9,12 +8,12 @@ use anyhow::{anyhow, Result}; use q_compress::data_types::TimestampMicros; +use crate::{BASE_DIR, BenchStat, dtype_str, NumberLike, Precomputed}; use crate::codecs::pco::PcoConfig; use crate::codecs::qco::QcoConfig; use crate::codecs::zstd::ZstdConfig; use crate::num_vec::NumVec; use crate::opt::HandlerOpt; -use crate::{dtype_str, BenchStat, NumberLike, Precomputed, BASE_DIR}; mod pco; mod qco; diff --git a/bench/src/main.rs b/bench/src/main.rs index 12c42709..5cd0cc14 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -51,7 +51,6 @@ macro_rules! 
impl_pco_number_like { } impl_pco_number_like!(i64, i64); -impl_pco_number_like!(f32, f32); impl_pco_number_like!(f64, f64); impl_pco_number_like!(TimestampMicros, i64); @@ -123,43 +122,6 @@ pub struct Precomputed { dtype: String, } -// fn compress( -// nums: &[T], -// config: &CodecConfig, -// ) -> (Duration, Vec) { -// let t = Instant::now(); -// let compressed = config.inner.compress(nums); -// // let compressed = match &mut qualified_config { -// // CodecConfig::ZStd(level) => { -// // let level = *level as i32; -// // zstd::encode_all(raw_bytes, level).unwrap() -// // } -// // }; -// ( -// Instant::now() - t, -// compressed, -// ) -// } - -// fn decompress( -// compressed: &[u8], -// config: &CodecConfig, -// ) -> (Duration, Vec) { -// let t = Instant::now(); -// let rec_nums = config.inner.decompress(compressed); -// // let rec_nums = match config { -// // CodecConfig::Pco(_) => decompress_pco(compressed), -// // CodecConfig::QCompress(_) => decompress_qco(compressed), -// // CodecConfig::ZStd(_) => { -// // // to do justice to zstd, unsafely convert the bytes it writes into T -// // // without copying -// // let decoded_bytes = zstd::decode_all(compressed).unwrap(); -// // cast_to_nums(decoded_bytes) -// // } -// // }; -// (Instant::now() - t, rec_nums) -// } - fn handle(path: &Path, config: &CodecConfig, opt: &Opt) -> PrintStat { let dataset = basename_no_ext(path); let dtype = dtype_str(&dataset); diff --git a/bench/src/num_vec.rs b/bench/src/num_vec.rs index c7ebdc1b..3cadedb4 100644 --- a/bench/src/num_vec.rs +++ b/bench/src/num_vec.rs @@ -7,20 +7,25 @@ pub enum NumVec { Micros(Vec), } -fn byte_vec_to_nums(raw_bytes: Vec) -> Vec { - let bytes_per_num = T::PHYSICAL_BITS / 8; - raw_bytes - .chunks_exact(bytes_per_num) - .map(|chunk| T::from_bytes(chunk).unwrap()) - .collect::>() +fn cast_to_nums(bytes: Vec) -> Vec { + // Here we're assuming the bytes are in the right format for our data type. 
+ // For instance, chunks of 8 little-endian bytes on most platforms for + // i64's. + // This is fast and should work across platforms. + let n = bytes.len() / (T::PHYSICAL_BITS / 8); + unsafe { + let mut nums = std::mem::transmute::<_, Vec>(bytes); + nums.set_len(n); + nums + } } impl NumVec { pub fn new(dtype: &str, raw_bytes: Vec) -> Self { match dtype { - "i64" => NumVec::I64(byte_vec_to_nums(raw_bytes)), - "f64" => NumVec::F64(byte_vec_to_nums(raw_bytes)), - "micros" => NumVec::Micros(byte_vec_to_nums(raw_bytes)), + "i64" => NumVec::I64(cast_to_nums(raw_bytes)), + "f64" => NumVec::F64(cast_to_nums(raw_bytes)), + "micros" => NumVec::Micros(cast_to_nums(raw_bytes)), _ => panic!("unknown dtype {}", dtype), } } diff --git a/pco/src/base_compressor.rs b/pco/src/base_compressor.rs index 93a47161..2d357d07 100644 --- a/pco/src/base_compressor.rs +++ b/pco/src/base_compressor.rs @@ -82,7 +82,7 @@ pub struct CompressorConfig { /// floats are approximately decimals (multiples of 0.01). /// /// When this is helpful, compression and decompression speeds are - /// substantially reduced (up to ~100%). In rare cases, this configuration + /// substantially reduced (up to ~50%). In rare cases, this configuration /// may reduce compression speed somewhat even when it isn't helpful. /// However, the compression ratio improvements tend to be quite large. 
pub use_float_mult: bool, From 2ba615c08d266e6b24dffe8f2be5decd1501d62f Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 19:50:00 -0400 Subject: [PATCH 13/18] snappy --- Cargo.lock | 5 ++-- bench/Cargo.toml | 1 + bench/run_single_snappy.sh | 1 - bench/run_snappy.sh | 4 --- bench/src/codecs/mod.rs | 9 ++++--- bench/src/codecs/snappy.rs | 52 ++++++++++++++++++++++++++++++++++++++ bench/src/main.rs | 1 + 7 files changed, 63 insertions(+), 10 deletions(-) delete mode 100644 bench/run_single_snappy.sh delete mode 100644 bench/run_snappy.sh create mode 100644 bench/src/codecs/snappy.rs diff --git a/Cargo.lock b/Cargo.lock index 30ea736b..c0003839 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -302,6 +302,7 @@ dependencies = [ "clap", "pco", "q_compress", + "snap", "tabled", "zstd", ] @@ -1312,9 +1313,9 @@ checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" [[package]] name = "snap" -version = "1.0.5" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451" +checksum = "5e9f0ab6ef7eb7353d9119c170a436d1bf248eea575ac42d19d12f4e34130831" [[package]] name = "socket2" diff --git a/bench/Cargo.toml b/bench/Cargo.toml index e8cc57a8..a857e806 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -13,5 +13,6 @@ anyhow = "1.0.71" clap = {version = "4.3.11", features = ["derive"]} pco = {path = "../pco" } q_compress = {path = "../quantile-compression/q_compress" } +snap = "1.1.0" tabled = "0.12.2" zstd = "0.12" diff --git a/bench/run_single_snappy.sh b/bench/run_single_snappy.sh deleted file mode 100644 index ee47835a..00000000 --- a/bench/run_single_snappy.sh +++ /dev/null @@ -1 +0,0 @@ -szip -k -f "data/binary/$1" diff --git a/bench/run_snappy.sh b/bench/run_snappy.sh deleted file mode 100644 index 3007ab54..00000000 --- a/bench/run_snappy.sh +++ /dev/null @@ -1,4 +0,0 @@ -cd q_compress/examples -mkdir -p data/snappy -ls data/binary | xargs 
-I{} sh run_single_snappy.sh {} -mv data/binary/*.sz data/snappy/ diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index 251b12e1..a6622436 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -8,15 +8,17 @@ use anyhow::{anyhow, Result}; use q_compress::data_types::TimestampMicros; -use crate::{BASE_DIR, BenchStat, dtype_str, NumberLike, Precomputed}; use crate::codecs::pco::PcoConfig; use crate::codecs::qco::QcoConfig; +use crate::codecs::snappy::SnappyConfig; use crate::codecs::zstd::ZstdConfig; use crate::num_vec::NumVec; use crate::opt::HandlerOpt; +use crate::{dtype_str, BenchStat, NumberLike, Precomputed, BASE_DIR}; mod pco; mod qco; +mod snappy; pub mod utils; mod zstd; @@ -126,7 +128,7 @@ impl CodecSurface for C { fname: &str, opt: &HandlerOpt, ) -> Precomputed { - let dtype = dtype_str(&dataset); + let dtype = dtype_str(dataset); // compress let compressed = self.compress_dynamic(nums); @@ -153,7 +155,7 @@ impl CodecSurface for C { let rec_nums = self.decompress_dynamic(dtype, &compressed); if !opt.no_assertions { - self.compare_nums_dynamic(&rec_nums, &nums); + self.compare_nums_dynamic(&rec_nums, nums); } Precomputed { @@ -225,6 +227,7 @@ impl FromStr for CodecConfig { "p" | "pco" | "pcodec" => Box::::default(), "q" | "qco" | "q_compress" => Box::::default(), "zstd" => Box::::default(), + "snap" | "snappy" => Box::::default(), _ => return Err(anyhow!("unknown codec: {}", name)), }; diff --git a/bench/src/codecs/snappy.rs b/bench/src/codecs/snappy.rs new file mode 100644 index 00000000..31948e29 --- /dev/null +++ b/bench/src/codecs/snappy.rs @@ -0,0 +1,52 @@ +use std::convert::TryInto; +use std::io::{Read, Write}; + +use anyhow::{anyhow, Result}; + +use crate::codecs::{utils, CodecInternal}; +use crate::NumberLike; + +#[derive(Clone, Debug, Default)] +pub struct SnappyConfig {} + +impl CodecInternal for SnappyConfig { + fn name(&self) -> &'static str { + "snappy" + } + + fn get_conf(&self, _key: &str) -> String { + 
panic!("bad conf") + } + + fn set_conf(&mut self, key: &str, _value: String) -> Result<()> { + Err(anyhow!("unknown conf: {}", key)) + } + + // we prefix with a u32 of the + fn compress(&self, nums: &[T]) -> Vec { + let mut res = Vec::new(); + res.extend((nums.len() as u32).to_le_bytes()); + + unsafe { + let mut wtr = snap::write::FrameEncoder::new(&mut res); + wtr.write_all(utils::num_slice_to_bytes(nums)).unwrap(); + wtr.flush().unwrap(); + } + res + } + + fn decompress(&self, bytes: &[u8]) -> Vec { + let len = u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as usize; + let mut res = Vec::::with_capacity(len); + let mut rdr = snap::read::FrameDecoder::new(&bytes[4..]); + unsafe { + res.set_len(len); + rdr + .read_exact(utils::num_slice_to_bytes_mut( + res.as_mut_slice(), + )) + .unwrap(); + } + res + } +} diff --git a/bench/src/main.rs b/bench/src/main.rs index 5cd0cc14..e44f5116 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -1,4 +1,5 @@ #![allow(clippy::useless_transmute)] +#![allow(clippy::uninit_vec)] mod codecs; pub mod num_vec; From b489303472a715782070642a1c24d36cedc99d36 Mon Sep 17 00:00:00 2001 From: mwlon Date: Sun, 9 Jul 2023 20:04:43 -0400 Subject: [PATCH 14/18] flag explanations --- bench/run_gzip.sh | 6 ------ bench/run_single_gzip.sh | 1 - bench/src/opt.rs | 17 +++++++++++++++++ 3 files changed, 17 insertions(+), 7 deletions(-) delete mode 100644 bench/run_gzip.sh delete mode 100644 bench/run_single_gzip.sh diff --git a/bench/run_gzip.sh b/bench/run_gzip.sh deleted file mode 100644 index 906ed6ef..00000000 --- a/bench/run_gzip.sh +++ /dev/null @@ -1,6 +0,0 @@ -cd q_compress/examples -for level in 1 9; do - echo compressing for level $level - mkdir -p data/gzip_$level - ls data/binary | xargs -I{} sh run_single_gzip.sh $level {} -done diff --git a/bench/run_single_gzip.sh b/bench/run_single_gzip.sh deleted file mode 100644 index bb4644c8..00000000 --- a/bench/run_single_gzip.sh +++ /dev/null @@ -1 +0,0 @@ -gzip -c "data/binary/$2" -$1 
> "data/gzip_$1/$2.gz" diff --git a/bench/src/opt.rs b/bench/src/opt.rs index bf92bea4..4fb1d974 100644 --- a/bench/src/opt.rs +++ b/bench/src/opt.rs @@ -6,10 +6,24 @@ use crate::codecs::CodecConfig; #[derive(Parser)] pub struct Opt { + /// Comma-separated list of codecs to benchmark, optionally with + /// colon-separated configurations. + /// + /// For example, setting this to + /// `zstd,zstd:level=10,pco:level=9:delta_order=0` + /// will compare 3 codecs: zstd at default compression level (3), zstd at + /// level 10, and pco at level 9 with 0th order delta encoding. + /// See the code in src/codecs/*.rs for configurations available to each + /// codec. #[arg(long, short, default_value = "pco", value_parser=CodecConfig::from_str, value_delimiter=',')] pub codecs: Vec, + /// Comma-separated substrings of datasets to benchmark. + /// By default all datasets are run. #[arg(long, short, default_value = "", value_delimiter = ',')] pub datasets: Vec, + /// Number of iterations to run each codec x dataset combination for + /// better estimation of durations. + /// The median duration is kept. #[arg(long, short, default_value = "10")] pub iters: usize, #[command(flatten)] @@ -22,6 +36,9 @@ pub struct HandlerOpt { pub no_compress: bool, #[arg(long)] pub no_decompress: bool, + /// Skip assertions that all the numbers came back bitwise identical. + /// + /// This does not affect benchmark timing. 
#[arg(long)] pub no_assertions: bool, } From 0b3eff8ec6a69363ceb97f5fe1091e43a889a54c Mon Sep 17 00:00:00 2001 From: mwlon Date: Mon, 10 Jul 2023 21:21:41 -0400 Subject: [PATCH 15/18] some progress toward parquet --- Cargo.lock | 1 + bench/Cargo.toml | 1 + bench/src/codecs/mod.rs | 10 ++++++---- bench/src/codecs/pco.rs | 6 +++--- bench/src/codecs/qco.rs | 6 +++--- bench/src/codecs/snappy.rs | 6 +++--- bench/src/codecs/utils.rs | 6 +++--- bench/src/codecs/zstd.rs | 6 +++--- bench/src/main.rs | 34 ++-------------------------------- bench/src/num_vec.rs | 4 ++-- 10 files changed, 27 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c0003839..5aec9432 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -300,6 +300,7 @@ version = "0.0.0" dependencies = [ "anyhow", "clap", + "parquet", "pco", "q_compress", "snap", diff --git a/bench/Cargo.toml b/bench/Cargo.toml index a857e806..a80ec03f 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -11,6 +11,7 @@ path = "src/main.rs" [dependencies] anyhow = "1.0.71" clap = {version = "4.3.11", features = ["derive"]} +parquet = {version = "43.0.0", features = ["snap", "zstd"], default-features=false} pco = {path = "../pco" } q_compress = {path = "../quantile-compression/q_compress" } snap = "1.1.0" diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index a6622436..e477372e 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -14,8 +14,10 @@ use crate::codecs::snappy::SnappyConfig; use crate::codecs::zstd::ZstdConfig; use crate::num_vec::NumVec; use crate::opt::HandlerOpt; -use crate::{dtype_str, BenchStat, NumberLike, Precomputed, BASE_DIR}; +use crate::{BASE_DIR, BenchStat, Precomputed}; +use crate::dtypes::{dtype_str, Dtype}; +mod parquet; mod pco; mod qco; mod snappy; @@ -31,8 +33,8 @@ trait CodecInternal: Clone + Debug + Send + Sync + Default + 'static { fn get_conf(&self, key: &str) -> String; fn set_conf(&mut self, key: &str, value: String) -> Result<()>; - fn compress(&self, 
nums: &[T]) -> Vec; - fn decompress(&self, compressed: &[u8]) -> Vec; + fn compress(&self, nums: &[T]) -> Vec; + fn decompress(&self, compressed: &[u8]) -> Vec; // sad manual dynamic dispatch, but at least we don't need all combinations // of (dtype x codec) @@ -53,7 +55,7 @@ trait CodecInternal: Clone + Debug + Send + Sync + Default + 'static { } } - fn compare_nums(&self, recovered: &[T], original: &[T]) { + fn compare_nums(&self, recovered: &[T], original: &[T]) { assert_eq!(recovered.len(), original.len()); for (i, (x, y)) in recovered.iter().zip(original.iter()).enumerate() { assert_eq!( diff --git a/bench/src/codecs/pco.rs b/bench/src/codecs/pco.rs index 0e3fe296..725fdaa4 100644 --- a/bench/src/codecs/pco.rs +++ b/bench/src/codecs/pco.rs @@ -1,5 +1,5 @@ use crate::codecs::CodecInternal; -use crate::NumberLike; +use crate::dtypes::Dtype; use anyhow::{anyhow, Result}; #[derive(Clone, Debug, Default)] @@ -50,7 +50,7 @@ impl CodecInternal for PcoConfig { Ok(()) } - fn compress(&self, nums: &[T]) -> Vec { + fn compress(&self, nums: &[T]) -> Vec { let mut c_config = self.compressor_config.clone(); let pco_nums = T::slice_to_pco(nums); if !self.use_fixed_delta { @@ -60,7 +60,7 @@ impl CodecInternal for PcoConfig { pco::standalone::simple_compress(c_config, pco_nums) } - fn decompress(&self, bytes: &[u8]) -> Vec { + fn decompress(&self, bytes: &[u8]) -> Vec { let v = pco::standalone::auto_decompress::(bytes).expect("could not decompress"); T::vec_from_pco(v) } diff --git a/bench/src/codecs/qco.rs b/bench/src/codecs/qco.rs index 43dfea57..7f4a653d 100644 --- a/bench/src/codecs/qco.rs +++ b/bench/src/codecs/qco.rs @@ -1,5 +1,5 @@ use crate::codecs::CodecInternal; -use crate::NumberLike; +use crate::dtypes::Dtype; use anyhow::{anyhow, Result}; #[derive(Clone, Debug, Default)] @@ -48,7 +48,7 @@ impl CodecInternal for QcoConfig { Ok(()) } - fn compress(&self, nums: &[T]) -> Vec { + fn compress(&self, nums: &[T]) -> Vec { let mut c_config = self.compressor_config.clone(); 
if !self.use_fixed_delta { c_config.delta_encoding_order = @@ -57,7 +57,7 @@ impl CodecInternal for QcoConfig { q_compress::standalone::Compressor::::from_config(c_config).simple_compress(nums) } - fn decompress(&self, bytes: &[u8]) -> Vec { + fn decompress(&self, bytes: &[u8]) -> Vec { q_compress::auto_decompress::(bytes).expect("could not decompress") } } diff --git a/bench/src/codecs/snappy.rs b/bench/src/codecs/snappy.rs index 31948e29..8934820b 100644 --- a/bench/src/codecs/snappy.rs +++ b/bench/src/codecs/snappy.rs @@ -4,7 +4,7 @@ use std::io::{Read, Write}; use anyhow::{anyhow, Result}; use crate::codecs::{utils, CodecInternal}; -use crate::NumberLike; +use crate::dtypes::Dtype; #[derive(Clone, Debug, Default)] pub struct SnappyConfig {} @@ -23,7 +23,7 @@ impl CodecInternal for SnappyConfig { } // we prefix with a u32 of the - fn compress(&self, nums: &[T]) -> Vec { + fn compress(&self, nums: &[T]) -> Vec { let mut res = Vec::new(); res.extend((nums.len() as u32).to_le_bytes()); @@ -35,7 +35,7 @@ impl CodecInternal for SnappyConfig { res } - fn decompress(&self, bytes: &[u8]) -> Vec { + fn decompress(&self, bytes: &[u8]) -> Vec { let len = u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as usize; let mut res = Vec::::with_capacity(len); let mut rdr = snap::read::FrameDecoder::new(&bytes[4..]); diff --git a/bench/src/codecs/utils.rs b/bench/src/codecs/utils.rs index da2c1295..77713325 100644 --- a/bench/src/codecs/utils.rs +++ b/bench/src/codecs/utils.rs @@ -1,8 +1,8 @@ -use crate::NumberLike; +use crate::dtypes::Dtype; use std::mem; // cursed ways to convert nums to bytes and back without doing work -pub unsafe fn num_slice_to_bytes(slice: &[T]) -> &[u8] { +pub unsafe fn num_slice_to_bytes(slice: &[T]) -> &[u8] { let len = slice.len(); let byte_len = len * (T::PHYSICAL_BITS / 8); &*std::ptr::slice_from_raw_parts( @@ -11,7 +11,7 @@ pub unsafe fn num_slice_to_bytes(slice: &[T]) -> &[u8] { ) } -pub unsafe fn num_slice_to_bytes_mut(slice: &mut [T]) -> &mut 
[u8] { +pub unsafe fn num_slice_to_bytes_mut(slice: &mut [T]) -> &mut [u8] { let len = slice.len(); let byte_len = len * (T::PHYSICAL_BITS / 8); &mut *std::ptr::slice_from_raw_parts_mut( diff --git a/bench/src/codecs/zstd.rs b/bench/src/codecs/zstd.rs index e677bbb9..a089369c 100644 --- a/bench/src/codecs/zstd.rs +++ b/bench/src/codecs/zstd.rs @@ -3,7 +3,7 @@ use std::convert::TryInto; use anyhow::{anyhow, Result}; use crate::codecs::{utils, CodecInternal}; -use crate::NumberLike; +use crate::dtypes::Dtype; #[derive(Clone, Debug, Default)] pub struct ZstdConfig { @@ -31,7 +31,7 @@ impl CodecInternal for ZstdConfig { } // we prefix with a u32 of the - fn compress(&self, nums: &[T]) -> Vec { + fn compress(&self, nums: &[T]) -> Vec { let mut res = Vec::new(); res.extend((nums.len() as u32).to_le_bytes()); unsafe { @@ -45,7 +45,7 @@ impl CodecInternal for ZstdConfig { res } - fn decompress(&self, bytes: &[u8]) -> Vec { + fn decompress(&self, bytes: &[u8]) -> Vec { let len = u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as usize; let mut res = Vec::::with_capacity(len); unsafe { diff --git a/bench/src/main.rs b/bench/src/main.rs index e44f5116..fbb1ec65 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -4,6 +4,7 @@ mod codecs; pub mod num_vec; mod opt; +mod dtypes; use std::fs; @@ -24,37 +25,6 @@ use q_compress::data_types::{NumberLike as QNumberLike, TimestampMicros}; const BASE_DIR: &str = "bench/data"; // if this delta order is specified, use a dataset-specific order -fn dtype_str(dataset: &str) -> &str { - dataset.split('_').next().unwrap() -} - -pub trait NumberLike: QNumberLike { - type Pco: PNumberLike; - - fn slice_to_pco(slice: &[Self]) -> &[Self::Pco]; - fn vec_from_pco(v: Vec) -> Vec; -} - -macro_rules! 
impl_pco_number_like { - ($t: ty, $pco: ty) => { - impl NumberLike for $t { - type Pco = $pco; - - fn slice_to_pco(slice: &[$t]) -> &[Self::Pco] { - unsafe { std::mem::transmute(slice) } - } - - fn vec_from_pco(v: Vec) -> Vec { - unsafe { std::mem::transmute(v) } - } - } - }; -} - -impl_pco_number_like!(i64, i64); -impl_pco_number_like!(f64, f64); -impl_pco_number_like!(TimestampMicros, i64); - #[derive(Clone, Default)] pub struct BenchStat { pub compress_dt: Duration, @@ -125,7 +95,7 @@ pub struct Precomputed { fn handle(path: &Path, config: &CodecConfig, opt: &Opt) -> PrintStat { let dataset = basename_no_ext(path); - let dtype = dtype_str(&dataset); + let dtype = dtypes::dtype_str(&dataset); let mut fname = dataset.to_string(); fname.push('_'); diff --git a/bench/src/num_vec.rs b/bench/src/num_vec.rs index 3cadedb4..dff3df52 100644 --- a/bench/src/num_vec.rs +++ b/bench/src/num_vec.rs @@ -1,4 +1,4 @@ -use crate::NumberLike; +use crate::dtypes::Dtype; use q_compress::data_types::TimestampMicros; pub enum NumVec { @@ -7,7 +7,7 @@ pub enum NumVec { Micros(Vec), } -fn cast_to_nums(bytes: Vec) -> Vec { +fn cast_to_nums(bytes: Vec) -> Vec { // Here we're assuming the bytes are in the right format for our data type. // For instance, chunks of 8 little-endian bytes on most platforms for // i64's. 
From 69ff8ad11d57f98cd310114540c689e443481971 Mon Sep 17 00:00:00 2001 From: mwlon Date: Mon, 10 Jul 2023 21:59:08 -0400 Subject: [PATCH 16/18] parquet fully working --- Cargo.lock | 1 + bench/Cargo.toml | 1 + bench/src/codecs/mod.rs | 2 + bench/src/codecs/parquet.rs | 164 ++++++++++++++++++++++++++++++++++++ bench/src/dtypes.rs | 89 +++++++++++++++++++ 5 files changed, 257 insertions(+) create mode 100644 bench/src/codecs/parquet.rs create mode 100644 bench/src/dtypes.rs diff --git a/Cargo.lock b/Cargo.lock index 5aec9432..14d66f1f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -299,6 +299,7 @@ name = "bench" version = "0.0.0" dependencies = [ "anyhow", + "bytes", "clap", "parquet", "pco", diff --git a/bench/Cargo.toml b/bench/Cargo.toml index a80ec03f..33818d12 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -10,6 +10,7 @@ path = "src/main.rs" [dependencies] anyhow = "1.0.71" +bytes = "1.1.0" clap = {version = "4.3.11", features = ["derive"]} parquet = {version = "43.0.0", features = ["snap", "zstd"], default-features=false} pco = {path = "../pco" } diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index e477372e..a361d7a3 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -15,6 +15,7 @@ use crate::codecs::zstd::ZstdConfig; use crate::num_vec::NumVec; use crate::opt::HandlerOpt; use crate::{BASE_DIR, BenchStat, Precomputed}; +use crate::codecs::parquet::ParquetConfig; use crate::dtypes::{dtype_str, Dtype}; mod parquet; @@ -230,6 +231,7 @@ impl FromStr for CodecConfig { "q" | "qco" | "q_compress" => Box::::default(), "zstd" => Box::::default(), "snap" | "snappy" => Box::::default(), + "parq" | "parquet" => Box::::default(), _ => return Err(anyhow!("unknown codec: {}", name)), }; diff --git a/bench/src/codecs/parquet.rs b/bench/src/codecs/parquet.rs new file mode 100644 index 00000000..e1e9db9d --- /dev/null +++ b/bench/src/codecs/parquet.rs @@ -0,0 +1,164 @@ +use std::convert::TryInto; +use std::sync::Arc; + +use 
anyhow::{anyhow, Result}; +use parquet::basic::{Compression, ZstdLevel}; +use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder, WriterPropertiesPtr}; +use parquet::file::reader::{SerializedFileReader, SerializedPageReader}; +use parquet::file::writer::{SerializedFileWriter, SerializedPageWriter}; +use parquet::schema::parser::parse_message_type; +use parquet::schema::types::{ColumnDescPtr, ColumnDescriptor, ColumnPath}; +use parquet::file::reader::{FileReader}; +use std::{fs::File, path::Path}; +use parquet::column::reader::get_typed_column_reader; + +use crate::codecs::{CodecInternal, utils}; +use crate::dtypes::Dtype; + +const ZSTD: &'static str = "zstd"; + +#[derive(Clone, Debug)] +pub struct ParquetConfig { + compression: Compression +} + +impl Default for ParquetConfig { + fn default() -> Self { + Self { + compression: Compression::UNCOMPRESSED + } + } +} + +fn str_to_compression(s: &str) -> Result { + let res = match s.to_lowercase().as_str() { + "uncompressed" => Compression::UNCOMPRESSED, + "snappy" => Compression::SNAPPY, + _ => { + if s.starts_with(ZSTD) { + let level = if s.len() > ZSTD.len() { + ZstdLevel::try_new(s[4..].to_string().parse::()?)? 
+ } else { + ZstdLevel::default() + }; + Compression::ZSTD(level) + } else { + return Err(anyhow!("unknown parquet codec {}", s)) + } + } + }; + Ok(res) +} + +fn compression_to_string(compression: &Compression) -> String { + match compression { + Compression::UNCOMPRESSED => "uncompressed".to_string(), + Compression::SNAPPY => "snappy".to_string(), + Compression::ZSTD(level) => format!("{}{}", ZSTD, level.compression_level()), + _ => panic!("should be unreachable"), + } +} + +// This approach compresses the vector as a parquet file with a single required column in a single row group +impl CodecInternal for ParquetConfig { + fn name(&self) -> &'static str { + "parquet" + } + + fn get_conf(&self, key: &str) -> String { + match key { + "compression" => compression_to_string(&self.compression), + _ => panic!("bad conf"), + } + } + + fn set_conf(&mut self, key: &str, value: String) -> Result<()> { + match key { + "compression" => self.compression = str_to_compression(&value)?, + _ => return Err(anyhow!("unknown conf: {}", key)), + } + Ok(()) + } + + fn compress(&self, nums: &[T]) -> Vec { + let mut res = Vec::new(); + let message_type = format!("message schema {{ REQUIRED {} nums; }}", T::PARQUET_DTYPE_STR); + let schema = Arc::new(parse_message_type(&message_type).unwrap()); + let mut writer = SerializedFileWriter::new( + &mut res, + schema, + Arc::new(WriterProperties::builder().set_compression(self.compression).build() + ) + ).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + while let Some(mut col_writer) = row_group_writer.next_column().unwrap() { + { + let typed = col_writer.typed::(); + typed.write_batch(T::slice_to_parquet(nums), None, None).unwrap(); + } + col_writer.close().unwrap() + } + row_group_writer.close().unwrap(); + writer.close().unwrap(); + + // let col_desc = ColumnDescPtr::new(ColumnDescriptor::new( + // T::parquet_type, + // 0, + // 0, + // ColumnPath::new(vec!["nums".to_string()]), + // )); + // let writer_properties = WriterPropertiesPtr::new(WriterProperties::builder() + //
.set_compression(self.compression) + // .build() + // ); + // let page_writer = Box::new(SerializedPageWriter::new(&mut res)); + // let mut col_writer = parquet::column::writer::get_column_writer( + // col_desc, + // writer_properties, + // page_writer, + // ); + // let mut col_writer = parquet::column::writer::get_typed_column_writer::(col_writer); + // col_writer.write_batch(nums, None, None).unwrap(); + res + } + + fn decompress(&self, bytes: &[u8]) -> Vec { + // couldn't find a way to make a parquet reader without a full copy of the compressed bytes; + // maybe this can be improved + let reader = SerializedFileReader::new(bytes::Bytes::from(bytes.to_vec())).unwrap(); + + let parquet_metadata = reader.metadata(); + let mut n = 0; + for row_group_meta in parquet_metadata.row_groups() { + n += row_group_meta.num_rows(); + } + + let mut res = Vec::with_capacity(n as usize); + unsafe { + res.set_len(n as usize); + } + for i in 0..parquet_metadata.num_row_groups() { + let row_group_reader = reader.get_row_group(i).unwrap(); + let mut col_reader = get_typed_column_reader::( + row_group_reader.get_column_reader(0).unwrap() + ); + col_reader.read_records(usize::MAX, None, None, &mut res).unwrap(); + } + // let col_desc = ColumnDescPtr::new(ColumnDescriptor::new( + // T::parquet_type, + // 0, + // 0, + // ColumnPath::new(vec!["nums".to_string()]), + // )); + // let page_reader = Box::new(SerializedPageReader::new( + // + // )); + // let col_reader = parquet::column::reader::get_column_reader( + // col_desc, + // page_reader, + // ); + // let mut col_reader = parquet::column::reader::get_typed_column_reader::(col_reader); + // col_reader.read_records(usize::MAX, None, None, &mut res).unwrap(); + T::vec_from_parquet(res) + } +} diff --git a/bench/src/dtypes.rs b/bench/src/dtypes.rs new file mode 100644 index 00000000..4e0a0585 --- /dev/null +++ b/bench/src/dtypes.rs @@ -0,0 +1,89 @@ +use std::mem; +use pco::data_types::NumberLike as PNumberLike; +use
q_compress::data_types::{NumberLike as QNumberLike, TimestampMicros}; +use parquet::data_type as parq; + +pub fn dtype_str(dataset: &str) -> &str { + dataset.split('_').next().unwrap() +} + +pub trait Dtype: QNumberLike { + type Pco: PNumberLike; + type Parquet: parquet::data_type::DataType; + + const PARQUET_DTYPE_STR: &'static str; + + fn slice_to_parquet(slice: &[Self]) -> &[::T]; + fn slice_to_pco(slice: &[Self]) -> &[Self::Pco]; + fn vec_from_pco(v: Vec) -> Vec; + fn vec_from_parquet(v: Vec<::T>) -> Vec; +} + +impl Dtype for i64 { + type Pco = i64; + type Parquet = parq::Int64Type; + + const PARQUET_DTYPE_STR: &'static str = "INT64"; + + fn slice_to_parquet(slice: &[Self]) -> &[::T] { + slice + } + + fn slice_to_pco(slice: &[Self]) -> &[Self::Pco] { + slice + } + + fn vec_from_pco(v: Vec) -> Vec { + v + } + + fn vec_from_parquet(v: Vec) -> Vec { + v + } +} + +impl Dtype for f64 { + type Pco = f64; + type Parquet = parq::DoubleType; + + const PARQUET_DTYPE_STR: &'static str = "DOUBLE"; + + fn slice_to_parquet(slice: &[Self]) -> &[::T] { + slice + } + + fn slice_to_pco(slice: &[Self]) -> &[Self::Pco] { + slice + } + + fn vec_from_pco(v: Vec) -> Vec { + v + } + + fn vec_from_parquet(v: Vec) -> Vec { + v + } +} + +impl Dtype for TimestampMicros { + type Pco = i64; + type Parquet = parq::Int64Type; + + const PARQUET_DTYPE_STR: &'static str = "INT64"; + + fn slice_to_parquet(slice: &[Self]) -> &[::T] { + unsafe { mem::transmute(slice) } + } + + fn slice_to_pco(slice: &[Self]) -> &[Self::Pco] { + unsafe { mem::transmute(slice) } + } + + fn vec_from_pco(v: Vec) -> Vec { + unsafe { mem::transmute(v) } + } + + fn vec_from_parquet(v: Vec) -> Vec { + unsafe { mem::transmute(v) } + } +} From c04dbc088370ce6bf7d300cde2936bc5fcf3b9a2 Mon Sep 17 00:00:00 2001 From: mwlon Date: Mon, 10 Jul 2023 22:02:05 -0400 Subject: [PATCH 17/18] fmt and clippy --- bench/src/codecs/mod.rs | 6 ++-- bench/src/codecs/parquet.rs | 59 +++++++++++++++++++++---------------- bench/src/dtypes.rs | 4 
+-- bench/src/main.rs | 4 +-- 4 files changed, 40 insertions(+), 33 deletions(-) diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index a361d7a3..66807138 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -8,15 +8,15 @@ use anyhow::{anyhow, Result}; use q_compress::data_types::TimestampMicros; +use crate::codecs::parquet::ParquetConfig; use crate::codecs::pco::PcoConfig; use crate::codecs::qco::QcoConfig; use crate::codecs::snappy::SnappyConfig; use crate::codecs::zstd::ZstdConfig; +use crate::dtypes::{dtype_str, Dtype}; use crate::num_vec::NumVec; use crate::opt::HandlerOpt; -use crate::{BASE_DIR, BenchStat, Precomputed}; -use crate::codecs::parquet::ParquetConfig; -use crate::dtypes::{dtype_str, Dtype}; +use crate::{BenchStat, Precomputed, BASE_DIR}; mod parquet; mod pco; diff --git a/bench/src/codecs/parquet.rs b/bench/src/codecs/parquet.rs index e1e9db9d..7d476b60 100644 --- a/bench/src/codecs/parquet.rs +++ b/bench/src/codecs/parquet.rs @@ -1,31 +1,30 @@ -use std::convert::TryInto; use std::sync::Arc; use anyhow::{anyhow, Result}; use parquet::basic::{Compression, ZstdLevel}; -use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder, WriterPropertiesPtr}; -use parquet::file::reader::{SerializedFileReader, SerializedPageReader}; -use parquet::file::writer::{SerializedFileWriter, SerializedPageWriter}; +use parquet::file::properties::WriterProperties; +use parquet::file::reader::SerializedFileReader; +use parquet::file::writer::SerializedFileWriter; use parquet::schema::parser::parse_message_type; -use parquet::schema::types::{ColumnDescPtr, ColumnDescriptor, ColumnPath}; -use parquet::file::reader::{FileReader}; -use std::{fs::File, path::Path}; + +use parquet::file::reader::FileReader; + use parquet::column::reader::get_typed_column_reader; -use crate::codecs::{CodecInternal, utils}; +use crate::codecs::CodecInternal; use crate::dtypes::Dtype; -const ZSTD: &'static str = "zstd"; +const ZSTD: &str = "zstd"; 
#[derive(Clone, Debug)] pub struct ParquetConfig { - compression: Compression + compression: Compression, } impl Default for ParquetConfig { fn default() -> Self { Self { - compression: Compression::UNCOMPRESSED + compression: Compression::UNCOMPRESSED, } } } @@ -35,15 +34,15 @@ fn str_to_compression(s: &str) -> Result { "uncompressed" => Compression::UNCOMPRESSED, "snappy" => Compression::SNAPPY, _ => { - if s.starts_with(ZSTD) { - let level = if s.len() > ZSTD.len() { - ZstdLevel::try_new(s[4..].to_string().parse::()?)? - } else { + if let Some(zstd_level_str) = s.strip_prefix(ZSTD) { + let level = if zstd_level_str.is_empty() { ZstdLevel::default() + } else { + ZstdLevel::try_new(zstd_level_str.parse::()?)? }; Compression::ZSTD(level) } else { - return Err(anyhow!("unknown parquet codec {}", s)) + return Err(anyhow!("unknown parquet codec {}", s)); } } }; @@ -82,19 +81,28 @@ impl CodecInternal for ParquetConfig { fn compress(&self, nums: &[T]) -> Vec { let mut res = Vec::new(); - let message_type = format!("message schema {{ REQUIRED {} nums; }}", T::PARQUET_DTYPE_STR); + let message_type = format!( + "message schema {{ REQUIRED {} nums; }}", + T::PARQUET_DTYPE_STR + ); let schema = Arc::new(parse_message_type(&message_type).unwrap()); let mut writer = SerializedFileWriter::new( &mut res, schema, - Arc::new(WriterProperties::builder().set_compression(self.compression).build() - ) - ).unwrap(); + Arc::new( + WriterProperties::builder() + .set_compression(self.compression) + .build(), + ), + ) + .unwrap(); let mut row_group_writer = writer.next_row_group().unwrap(); while let Some(mut col_writer) = row_group_writer.next_column().unwrap() { { let typed = col_writer.typed::(); - typed.write_batch(T::slice_to_parquet(nums), None, None).unwrap(); + typed + .write_batch(T::slice_to_parquet(nums), None, None) + .unwrap(); } col_writer.close().unwrap() } @@ -139,10 +147,11 @@ impl CodecInternal for ParquetConfig { } for i in 0..parquet_metadata.num_row_groups() { let 
row_group_reader = reader.get_row_group(i).unwrap(); - let mut col_reader = get_typed_column_reader::( - row_group_reader.get_column_reader(0).unwrap() - ); - col_reader.read_records(usize::MAX, None, None, &mut res).unwrap(); + let mut col_reader = + get_typed_column_reader::(row_group_reader.get_column_reader(0).unwrap()); + col_reader + .read_records(usize::MAX, None, None, &mut res) + .unwrap(); } // let col_desc = ColumnDescPtr::new(ColumnDescriptor::new( // T::parquet_type, diff --git a/bench/src/dtypes.rs b/bench/src/dtypes.rs index 4e0a0585..f8ce237a 100644 --- a/bench/src/dtypes.rs +++ b/bench/src/dtypes.rs @@ -1,7 +1,7 @@ -use std::mem; +use parquet::data_type as parq; use pco::data_types::NumberLike as PNumberLike; use q_compress::data_types::{NumberLike as QNumberLike, TimestampMicros}; -use parquet::data_type as parq; +use std::mem; pub fn dtype_str(dataset: &str) -> &str { dataset.split('_').next().unwrap() diff --git a/bench/src/main.rs b/bench/src/main.rs index fbb1ec65..9840ea77 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -2,9 +2,9 @@ #![allow(clippy::uninit_vec)] mod codecs; +mod dtypes; pub mod num_vec; mod opt; -mod dtypes; use std::fs; @@ -19,8 +19,6 @@ use tabled::{Table, Tabled}; use crate::codecs::CodecConfig; use crate::num_vec::NumVec; use opt::Opt; -use pco::data_types::NumberLike as PNumberLike; -use q_compress::data_types::{NumberLike as QNumberLike, TimestampMicros}; const BASE_DIR: &str = "bench/data"; // if this delta order is specified, use a dataset-specific order From bcc5db51ac81cf247937de4fdd0a2832e13b4521 Mon Sep 17 00:00:00 2001 From: mwlon Date: Mon, 10 Jul 2023 22:13:03 -0400 Subject: [PATCH 18/18] fix filenames --- bench/generate_randoms.py | 15 --------------- bench/src/codecs/mod.rs | 2 +- bench/src/main.rs | 9 ++++++--- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/bench/generate_randoms.py b/bench/generate_randoms.py index 6310c1af..9fa0d4c6 100644 --- a/bench/generate_randoms.py +++ 
b/bench/generate_randoms.py @@ -2,8 +2,6 @@ # pip requirement: numpy, pyarrow import numpy as np -import pyarrow as pa -from pyarrow import parquet as pq from datetime import datetime import os @@ -13,27 +11,14 @@ base_dir = 'bench/data' os.makedirs(f'{base_dir}/txt', exist_ok=True) -os.makedirs(f'{base_dir}/parquet', exist_ok=True) -os.makedirs(f'{base_dir}/snappy_parquet', exist_ok=True) -os.makedirs(f'{base_dir}/gzip_parquet', exist_ok=True) -os.makedirs(f'{base_dir}/zstd_parquet', exist_ok=True) os.makedirs(f'{base_dir}/binary', exist_ok=True) -def write_parquet_tables(arr, full_name): - print(f'writing parquet for {full_name}...') - table = pa.Table.from_pydict({'nums': arr}) -# pq.write_table(table, f'{base_dir}/parquet/{full_name}.parquet', compression='NONE') -# pq.write_table(table, f'{base_dir}/snappy_parquet/{full_name}.snappy.parquet', compression='snappy') -# pq.write_table(table, f'{base_dir}/gzip_parquet/{full_name}.gzip.parquet', compression='gzip', compression_level=6) - pq.write_table(table, f'{base_dir}/zstd_parquet/{full_name}.zstd.parquet', compression='zstd', compression_level=3) - def write_generic(strs, arr, full_name): joined = '\n'.join(strs) with open(f'{base_dir}/txt/{full_name}.txt', 'w') as f: f.write(joined) with open(f'{base_dir}/binary/{full_name}.bin', 'wb') as f: f.write(arr.tobytes()) - write_parquet_tables(arr, full_name) def write_i64(arr, name): if arr.dtype != np.int64: diff --git a/bench/src/codecs/mod.rs b/bench/src/codecs/mod.rs index 66807138..698da432 100644 --- a/bench/src/codecs/mod.rs +++ b/bench/src/codecs/mod.rs @@ -143,7 +143,7 @@ impl CodecSurface for C { // write to disk let output_dir = format!("{}/{}", BASE_DIR, self.name()); - let output_path = format!("{}/{}.{}", output_dir, fname, self.name()); + let output_path = format!("{}/{}", output_dir, fname); match fs::create_dir(&output_dir) { Ok(()) => (), diff --git a/bench/src/main.rs b/bench/src/main.rs index 9840ea77..e63695e4 100644 --- a/bench/src/main.rs +++ 
b/bench/src/main.rs @@ -95,9 +95,12 @@ fn handle(path: &Path, config: &CodecConfig, opt: &Opt) -> PrintStat { let dataset = basename_no_ext(path); let dtype = dtypes::dtype_str(&dataset); - let mut fname = dataset.to_string(); - fname.push('_'); - fname.push_str(&config.details()); + let fname = format!( + "{}{}.{}", + &dataset, + config.details(), + config.inner.name(), + ); let raw_bytes = fs::read(path).expect("could not read"); let num_vec = NumVec::new(dtype, raw_bytes); let precomputed = config