From d1d5beb1ce7964f638d18d5a04e8ea441bd19695 Mon Sep 17 00:00:00 2001 From: "Ilya (Marshal)" Date: Thu, 7 Mar 2024 20:07:24 +0100 Subject: [PATCH 1/8] Add own implementation of CAR reader --- Cargo.toml | 4 +-- src/lib.rs | 80 +++++++++++++++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2f82206..530411c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,11 +16,11 @@ crate-type = ["rlib", "cdylib"] pyo3 = { version = "0.20", features = ["generate-import-lib", "anyhow"] } python3-dll-a = "0.2.7" anyhow = "1.0.75" -futures = "0.3" libipld = { version = "0.16.0", features = ["dag-cbor"] } -iroh-car = "0.4.0" multibase = "0.9" byteorder = "1.5.0" +leb128 = "0.2.1" +multihash = "0.18.1" [workspace] members = [ "profiling" ] diff --git a/src/lib.rs b/src/lib.rs index 2a98c38..ae4c55d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,30 +3,18 @@ use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write}; use ::libipld::cbor::{cbor, cbor::MajorKind, decode, encode}; use ::libipld::cbor::error::{LengthOutOfRange, NumberOutOfRange, UnknownTag}; use ::libipld::cid::Cid; +use ::libipld::cid::Version; +use ::libipld::cid::Result as CidResult; +use ::libipld::cid::Error as CidError; use anyhow::Result; use byteorder::{BigEndian, ByteOrder}; -use futures::{executor, stream::StreamExt}; -use iroh_car::{CarHeader, CarReader, Error as CarError}; use pyo3::{PyObject, Python}; use pyo3::conversion::ToPyObject; use pyo3::prelude::*; use pyo3::types::*; +use leb128; +use multihash::{Multihash}; -fn car_header_to_pydict<'py>(py: Python<'py>, header: &CarHeader) -> &'py PyDict { - let dict_obj = PyDict::new(py); - - dict_obj.set_item("version", header.version()).unwrap(); - - let roots = PyList::empty(py); - header.roots().iter().for_each(|cid| { - let cid_obj = cid.to_string().to_object(py); - roots.append(cid_obj).unwrap(); - }); - - dict_obj.set_item("roots", roots).unwrap(); - - dict_obj.into() -} fn cid_hash_to_pydict<'py>(py: Python<'py>, cid: &Cid) -> &'py PyDict { let hash = cid.hash(); @@ -287,30 +275,54 @@ fn decode_dag_cbor_multi<'py>(py: Python<'py>, data: &[u8]) -> PyResult<&'py PyL Ok(decoded_parts) } -#[pyfunction] -pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(&'py PyDict, &'py PyDict)> { - let car_response = executor::block_on(CarReader::new(data)); - if let Err(e) = car_response { - return Err(get_err("Failed to decode CAR", e.to_string())); +fn read_cid_from_bytes(r: &mut R) -> CidResult { + let version = leb128::read::unsigned(r).unwrap(); + let codec = leb128::read::unsigned(r).unwrap(); + + if [version, codec] == [0x12, 0x20] { + let mut digest = [0u8; 32]; + r.read_exact(&mut digest)?; + let mh = Multihash::wrap(version, &digest).expect("Digest is always 32 bytes."); + return Cid::new_v0(mh); } - let car = car_response.unwrap(); + let version = Version::try_from(version)?; + match version { + Version::V0 => Err(CidError::InvalidCidVersion), + Version::V1 => { + let mh = Multihash::read(r)?; + Cid::new(version, codec, mh) + } + } +} - let header = car_header_to_pydict(py, car.header()); +#[pyfunction] +pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(PyObject, &'py PyDict)> { + let buf = &mut BufReader::new(Cursor::new(data)); + + leb128::read::unsigned(buf).unwrap(); + let header = decode_dag_cbor_to_pyobject(py, buf, 0).unwrap(); let parsed_blocks = PyDict::new(py); - let blocks: Vec), CarError>> = executor::block_on(car.stream().collect()); - blocks.into_iter().for_each(|block| { - if let Ok((cid, bytes)) = block { - let py_object = decode_dag_cbor_to_pyobject(py, &mut BufReader::new(Cursor::new(bytes)), 0); - if let Ok(py_object) = py_object { - let key = cid.to_string().to_object(py); - parsed_blocks.set_item(key, py_object).unwrap(); - } + loop { + if let Err(_) = leb128::read::unsigned(buf) { + break; } - }); - Ok((header, parsed_blocks)) + let cid = read_cid_from_bytes(buf); + if let Err(_) = cid { + break; + } + + let block = decode_dag_cbor_to_pyobject(py, buf, 0); + if let Ok(block) = block { + parsed_blocks.set_item(cid.unwrap().to_string().to_object(py), block).unwrap(); + } else { + break; + } + } + + Ok((header.into(), parsed_blocks)) } #[pyfunction] From c4516c66614fc5e9739e5fb51fc0b8aa9deebed9 Mon Sep 17 00:00:00 2001 From: "Ilya (Marshal)" Date: Thu, 7 Mar 2024 21:14:22 +0100 Subject: [PATCH 2/8] Implement own read_u64_leb128 --- Cargo.toml | 1 - src/lib.rs | 32 +++++++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 530411c..c8b1e84 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,6 @@ anyhow = "1.0.75" libipld = { version = "0.16.0", features = ["dag-cbor"] } multibase = "0.9" byteorder = "1.5.0" -leb128 = "0.2.1" multihash = "0.18.1" [workspace] diff --git a/src/lib.rs b/src/lib.rs index ae4c55d..fdd117f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,6 @@ use pyo3::{PyObject, Python}; use pyo3::conversion::ToPyObject; use pyo3::prelude::*; use pyo3::types::*; -use leb128; use multihash::{Multihash}; @@ -275,9 +274,32 @@ fn decode_dag_cbor_multi<'py>(py: Python<'py>, data: &[u8]) -> PyResult<&'py PyL Ok(decoded_parts) } +#[inline] +fn read_u64_leb128(r: &mut R) -> Result { + let mut result = 0; + let mut shift = 0; + + loop { + let mut buf = [0]; + if let Err(_) = r.read_exact(&mut buf) { + return Err(anyhow::anyhow!("Unexpected EOF while reading ULEB128 number.")); + } + + let byte = buf[0] as u64; + if (byte & 0x80) == 0 { + result |= (byte) << shift; + return Ok(result); + } else { + result |= (byte & 0x7F) << shift; + } + + shift += 7; + } +} + fn read_cid_from_bytes(r: &mut R) -> CidResult { - let version = leb128::read::unsigned(r).unwrap(); - let codec = leb128::read::unsigned(r).unwrap(); + let version = read_u64_leb128(r).unwrap(); + let codec = read_u64_leb128(r).unwrap(); if [version, codec] == [0x12, 0x20] { let mut digest = [0u8; 32]; @@ -300,12 +322,12 @@ fn read_cid_from_bytes(r: &mut R) -> CidResult { pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(PyObject, &'py PyDict)> { let buf = &mut BufReader::new(Cursor::new(data)); - leb128::read::unsigned(buf).unwrap(); + let _ = read_u64_leb128(buf); let header = decode_dag_cbor_to_pyobject(py, buf, 0).unwrap(); let parsed_blocks = PyDict::new(py); loop { - if let Err(_) = leb128::read::unsigned(buf) { + if let Err(_) = read_u64_leb128(buf) { break; } From 572cc39da976e2ea7462968e4e71088f4295a32a Mon Sep 17 00:00:00 2001 From: "Ilya (Marshal)" Date: Mon, 11 Mar 2024 11:47:00 +0100 Subject: [PATCH 3/8] fix merging --- src/lib.rs | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f23fd09..31f9d12 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,22 +15,6 @@ use pyo3::types::*; use pyo3::pybacked::PyBackedStr; use multihash::{Multihash}; -fn car_header_to_pydict<'py>(py: Python<'py>, header: &CarHeader) -> Bound<'py, PyDict> { - let dict_obj = PyDict::new_bound(py); - - dict_obj.set_item("version", header.version()).unwrap(); - - let roots = PyList::empty_bound(py); - header.roots().iter().for_each(|cid| { - let cid_obj = cid.to_string().to_object(py); - roots.append(cid_obj).unwrap(); - }); - - dict_obj.set_item("roots", roots).unwrap(); - - dict_obj -} - fn cid_hash_to_pydict<'py>(py: Python<'py>, cid: &Cid) -> Bound<'py, PyDict> { let hash = cid.hash(); let dict_obj = PyDict::new_bound(py); @@ -339,7 +323,7 @@ fn read_cid_from_bytes(r: &mut R) -> CidResult { } #[pyfunction] -pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(Bound<'py, PyDict>, Bound<'py, PyDict>)> { +pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(PyObject, Bound<'py, PyDict>)> { let buf = &mut BufReader::new(Cursor::new(data)); let _ = read_u64_leb128(buf); From 9aa569960898e39081972a5f634e5f4d6f9e8f4e Mon Sep 17 00:00:00 2001 From: "Ilya (Marshal)" Date: Mon, 11 Mar 2024 12:13:01 +0100 Subject: [PATCH 4/8] code refactoring --- src/lib.rs | 99 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 32 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 31f9d12..6047e3d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,19 +1,16 @@ use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write}; -use ::libipld::cbor::{cbor, cbor::MajorKind, decode, encode}; use ::libipld::cbor::error::{LengthOutOfRange, NumberOutOfRange, UnknownTag}; -use ::libipld::cid::Cid; -use ::libipld::cid::Version; -use ::libipld::cid::Result as CidResult; -use ::libipld::cid::Error as CidError; -use anyhow::Result; +use ::libipld::cbor::{cbor, cbor::MajorKind, decode, encode}; +use ::libipld::cid::{Cid, Version, Result as CidResult, Error as CidError}; +use anyhow::{anyhow, Result}; use byteorder::{BigEndian, ByteOrder}; -use pyo3::{PyObject, Python}; +use multihash::Multihash; use pyo3::conversion::ToPyObject; use pyo3::prelude::*; -use pyo3::types::*; use pyo3::pybacked::PyBackedStr; -use multihash::{Multihash}; +use pyo3::types::*; +use pyo3::{PyObject, Python}; fn cid_hash_to_pydict<'py>(py: Python<'py>, cid: &Cid) -> Bound<'py, PyDict> { let hash = cid.hash(); @@ -21,7 +18,9 @@ fn cid_hash_to_pydict<'py>(py: Python<'py>, cid: &Cid) -> Bound<'py, PyDict> { dict_obj.set_item("code", hash.code()).unwrap(); dict_obj.set_item("size", hash.size()).unwrap(); - dict_obj.set_item("digest", PyBytes::new_bound(py, &hash.digest())).unwrap(); + dict_obj + .set_item("digest", PyBytes::new_bound(py, &hash.digest())) + .unwrap(); dict_obj } @@ -31,7 +30,9 @@ fn cid_to_pydict<'py>(py: Python<'py>, cid: &Cid) -> Bound<'py, PyDict> { dict_obj.set_item("version", cid.version() as u64).unwrap(); dict_obj.set_item("codec", cid.codec()).unwrap(); - dict_obj.set_item("hash", cid_hash_to_pydict(py, cid)).unwrap(); + dict_obj + .set_item("hash", cid_hash_to_pydict(py, cid)) + .unwrap(); dict_obj } @@ -62,7 +63,8 @@ fn sort_map_keys(keys: &Bound, len: usize) -> Vec<(PyBackedStr, usiz keys_str.push((backed_str, i)); } - keys_str.sort_by(|a, b| { // sort_unstable_by performs bad + keys_str.sort_by(|a, b| { + // sort_unstable_by performs bad let (s1, _) = a; let (s2, _) = b; @@ -72,9 +74,13 @@ fn sort_map_keys(keys: &Bound, len: usize) -> Vec<(PyBackedStr, usiz keys_str } -fn decode_dag_cbor_to_pyobject(py: Python, r: &mut R, deep: usize) -> Result { +fn decode_dag_cbor_to_pyobject( + py: Python, + r: &mut R, + deep: usize, +) -> Result { let major = decode::read_major(r)?; - let py_object = match major.kind() { + Ok(match major.kind() { MajorKind::UnsignedInt => (decode::read_uint(r, major)?).to_object(py), MajorKind::NegativeInt => (-1 - decode::read_uint(r, major)? as i64).to_object(py), MajorKind::ByteString => { @@ -104,7 +110,7 @@ fn decode_dag_cbor_to_pyobject(py: Python, r: &mut R, deep: usiz // DAG-CBOR keys are always strings let key_major = decode::read_major(r)?; if key_major.kind() != MajorKind::TextString { - return Err(anyhow::anyhow!("Map keys must be strings")); + return Err(anyhow!("Map keys must be strings")); } let key_len = decode::read_uint(r, key_major)?; @@ -112,14 +118,14 @@ fn decode_dag_cbor_to_pyobject(py: Python, r: &mut R, deep: usiz if let Some(prev_key) = prev_key { if map_key_cmp(&prev_key, &key) == std::cmp::Ordering::Greater { - return Err(anyhow::anyhow!("Map keys must be sorted")); + return Err(anyhow!("Map keys must be sorted")); } } let key_py = key.to_object(py); prev_key = Some(key); if dict.get_item(&key_py)?.is_some() { - return Err(anyhow::anyhow!("Duplicate keys are not allowed")); + return Err(anyhow!("Duplicate keys are not allowed")); } let value = decode_dag_cbor_to_pyobject(py, r, deep + 1)?; @@ -131,7 +137,7 @@ fn decode_dag_cbor_to_pyobject(py: Python, r: &mut R, deep: usiz MajorKind::Tag => { let value = decode::read_uint(r, major)?; if value != 42 { - return Err(anyhow::anyhow!("Non-42 tags are not supported")); + return Err(anyhow!("Non-42 tags are not supported")); } let cid = decode::read_link(r)?.to_string(); @@ -143,13 +149,16 @@ fn decode_dag_cbor_to_pyobject(py: Python, r: &mut R, deep: usiz cbor::NULL => py.None(), cbor::F32 => decode::read_f32(r)?.to_object(py), cbor::F64 => decode::read_f64(r)?.to_object(py), - _ => return Err(anyhow::anyhow!(format!("Unsupported major type"))), + _ => return Err(anyhow!(format!("Unsupported major type"))), }, - }; - Ok(py_object) + }) } -fn encode_dag_cbor_from_pyobject<'py, W: Write>(py: Python<'py>, obj: &Bound<'py, PyAny>, w: &mut W) -> Result<()> { +fn encode_dag_cbor_from_pyobject<'py, W: Write>( + py: Python<'py>, + obj: &Bound<'py, PyAny>, + w: &mut W, +) -> Result<()> { /* Order is important for performance! Fast checks go first: @@ -169,7 +178,11 @@ fn encode_dag_cbor_from_pyobject<'py, W: Write>(py: Python<'py>, obj: &Bound<'py Ok(()) } else if obj.is_instance_of::() { - let buf = if obj.is_truthy()? { [cbor::TRUE.into()] } else { [cbor::FALSE.into()] }; + let buf = if obj.is_truthy()? { + [cbor::TRUE.into()] + } else { + [cbor::FALSE.into()] + }; w.write_all(&buf)?; Ok(()) @@ -257,7 +270,7 @@ fn encode_dag_cbor_from_pyobject<'py, W: Write>(py: Python<'py>, obj: &Bound<'py Ok(()) } } else { - return Err(UnknownTag(0).into()); + Err(UnknownTag(0).into()) } } @@ -286,7 +299,9 @@ fn read_u64_leb128(r: &mut R) -> Result { loop { let mut buf = [0]; if let Err(_) = r.read_exact(&mut buf) { - return Err(anyhow::anyhow!("Unexpected EOF while reading ULEB128 number.")); + return Err(anyhow!( + "Unexpected EOF while reading ULEB128 number." + )); } let byte = buf[0] as u64; @@ -342,7 +357,9 @@ pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(PyObject, Boun let block = decode_dag_cbor_to_pyobject(py, buf, 0); if let Ok(block) = block { - parsed_blocks.set_item(cid.unwrap().to_string().to_object(py), block).unwrap(); + parsed_blocks + .set_item(cid.unwrap().to_string().to_object(py), block) + .unwrap(); } else { break; } @@ -357,12 +374,18 @@ fn decode_dag_cbor(py: Python, data: &[u8]) -> PyResult { if let Ok(py_object) = py_object { Ok(py_object) } else { - Err(get_err("Failed to decode DAG-CBOR", py_object.unwrap_err().to_string())) + Err(get_err( + "Failed to decode DAG-CBOR", + py_object.unwrap_err().to_string(), + )) } } #[pyfunction] -fn encode_dag_cbor<'py>(py: Python<'py>, data: &Bound<'py, PyAny>) -> PyResult> { +fn encode_dag_cbor<'py>( + py: Python<'py>, + data: &Bound<'py, PyAny>, +) -> PyResult> { let mut buf = &mut BufWriter::new(Vec::new()); if let Err(e) = encode_dag_cbor_from_pyobject(py, data, &mut buf) { return Err(get_err("Failed to encode DAG-CBOR", e.to_string())); @@ -379,7 +402,10 @@ fn decode_cid<'py>(py: Python<'py>, data: &str) -> PyResult> if let Ok(cid) = cid { Ok(cid_to_pydict(py, &cid)) } else { - Err(get_err("Failed to decode CID", cid.unwrap_err().to_string())) + Err(get_err( + "Failed to decode CID", + cid.unwrap_err().to_string(), + )) } } @@ -389,7 +415,10 @@ fn decode_multibase<'py>(py: Python<'py>, data: &str) -> PyResult<(char, Bound<' if let Ok((base, data)) = base { Ok((base.code(), PyBytes::new_bound(py, &data))) } else { - Err(get_err("Failed to decode multibase", base.unwrap_err().to_string())) + Err(get_err( + "Failed to decode multibase", + base.unwrap_err().to_string(), + )) } } @@ -406,14 +435,20 @@ fn encode_multibase(code: char, data: &Bound) -> PyResult { let s = data.downcast::().unwrap(); data_bytes = s.to_str()?.as_bytes(); } else { - return Err(get_err("Failed to encode multibase", "Unsupported data type".to_string())); + return Err(get_err( + "Failed to encode multibase", + "Unsupported data type".to_string(), + )); } let base = multibase::Base::from_code(code); if let Ok(base) = base { Ok(multibase::encode(base, data_bytes)) } else { - Err(get_err("Failed to encode multibase", base.unwrap_err().to_string())) + Err(get_err( + "Failed to encode multibase", + base.unwrap_err().to_string(), + )) } } From 6c1c10b2e7da0af64ae9df9efa64d532263798ce Mon Sep 17 00:00:00 2001 From: "Ilya (Marshal)" Date: Mon, 11 Mar 2024 14:11:22 +0100 Subject: [PATCH 5/8] add error handling; add tests --- pytests/test_decode_car.py | 73 +++++++++++++++++++++++++++- src/lib.rs | 99 +++++++++++++++++++++++++++++++------- 2 files changed, 153 insertions(+), 19 deletions(-) diff --git a/pytests/test_decode_car.py b/pytests/test_decode_car.py index aeb50fb..0657bdb 100644 --- a/pytests/test_decode_car.py +++ b/pytests/test_decode_car.py @@ -5,7 +5,6 @@ from conftest import load_car_fixture - _DID = os.environ.get('CAR_REPO_DID', 'did:plc:w4es6sfh43zlht3bgrzi5qzq') # default is public bot in bsky.app _REPO_CAR_PATH = os.path.join(os.path.dirname(__file__), '..', 'data', 'repo.car') @@ -27,3 +26,75 @@ def test_decode_car(benchmark, car) -> None: assert all(len(k) == 59 for k in blocks.keys()) assert all(isinstance(v, dict) for v in blocks.values()) assert all(v for v in blocks.values()) # not empty dict + + +def test_decode_car_invalid_header_len() -> None: + with pytest.raises(ValueError) as exc_info: + libipld.decode_car(b'') + + assert 'Invalid uvarint' in str(exc_info.value) + + +def test_decode_car_invalid_header() -> None: + with pytest.raises(ValueError) as exc_info: + header_len = bytes.fromhex('33') # 3 + header_obj = libipld.encode_dag_cbor('strInsteadOfObj') + libipld.decode_car(header_len + header_obj) + + assert 'Header is not a object' in str(exc_info.value) + + +def test_decode_car_invalid_header_version_key() -> None: + with pytest.raises(ValueError) as exc_info: + header_len = bytes.fromhex('33') # 3 + header_obj = libipld.encode_dag_cbor({'blabla': 'blabla'}) + libipld.decode_car(header_len + header_obj) + + assert 'Version is None' in str(exc_info.value) + + +def test_decode_car_invalid_header_version_value() -> None: + with pytest.raises(ValueError) as exc_info: + header_len = bytes.fromhex('33') # 3 + header_obj = libipld.encode_dag_cbor({'version': 2}) + libipld.decode_car(header_len + header_obj) + + assert 'Version must be 1' in str(exc_info.value) + + +def test_decode_car_invalid_header_roots_key() -> None: + with pytest.raises(ValueError) as exc_info: + header_len = bytes.fromhex('33') # 3 + header_obj = libipld.encode_dag_cbor({'version': 1}) + libipld.decode_car(header_len + header_obj) + + assert 'Roots is None' in str(exc_info.value) + + +def test_decode_car_invalid_header_roots_value_type() -> None: + with pytest.raises(TypeError) as exc_info: + header_len = bytes.fromhex('33') # 3 + header_obj = libipld.encode_dag_cbor({'version': 1, 'roots': 123}) + libipld.decode_car(header_len + header_obj) + + assert "cannot be converted to 'PyList'" in str(exc_info.value) + + +def test_decode_car_invalid_header_roots_value_empty_list() -> None: + with pytest.raises(ValueError) as exc_info: + header_len = bytes.fromhex('33') # 3 + header_obj = libipld.encode_dag_cbor({'version': 1, 'roots': []}) + libipld.decode_car(header_len + header_obj) + + assert 'Roots is empty' in str(exc_info.value) + + +def test_decode_car_invalid_block_cid() -> None: + with pytest.raises(ValueError) as exc_info: + header_len = bytes.fromhex('33') # 3 + header_obj = libipld.encode_dag_cbor({'version': 1, 'roots': ['blabla']}) + block1 = bytes.fromhex('33') + b'invalidSid' + + libipld.decode_car(header_len + header_obj + block1) + + assert 'Failed to read CID of block' in str(exc_info.value) diff --git a/src/lib.rs b/src/lib.rs index 6047e3d..c0e09a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,7 @@ use std::io::{BufReader, BufWriter, Cursor, Read, Seek, Write}; use ::libipld::cbor::error::{LengthOutOfRange, NumberOutOfRange, UnknownTag}; use ::libipld::cbor::{cbor, cbor::MajorKind, decode, encode}; -use ::libipld::cid::{Cid, Version, Result as CidResult, Error as CidError}; +use ::libipld::cid::{Cid, Error as CidError, Result as CidResult, Version}; use anyhow::{anyhow, Result}; use byteorder::{BigEndian, ByteOrder}; use multihash::Multihash; @@ -299,9 +299,7 @@ fn read_u64_leb128(r: &mut R) -> Result { loop { let mut buf = [0]; if let Err(_) = r.read_exact(&mut buf) { - return Err(anyhow!( - "Unexpected EOF while reading ULEB128 number." - )); + return Err(anyhow!("Unexpected EOF while reading ULEB128 number.")); } let byte = buf[0] as u64; @@ -341,31 +339,96 @@ fn read_cid_from_bytes(r: &mut R) -> CidResult { pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(PyObject, Bound<'py, PyDict>)> { let buf = &mut BufReader::new(Cursor::new(data)); - let _ = read_u64_leb128(buf); - let header = decode_dag_cbor_to_pyobject(py, buf, 0).unwrap(); + if let Err(_) = read_u64_leb128(buf) { + return Err(get_err( + "Failed to read CAR header", + "Invalid uvarint".to_string(), + )); + } + + let Ok(header_obj) = decode_dag_cbor_to_pyobject(py, buf, 0) else { + return Err(get_err( + "Failed to read CAR header", + "Invalid DAG-CBOR".to_string(), + )); + }; + + if !header_obj.bind(py).is_instance_of::() { + return Err(get_err( + "Failed to read CAR header", + "Header is not a object".to_string(), + )); + } + + let header = header_obj.downcast_bound::(py).unwrap(); + + let Some(version) = header.get_item("version")? else { + return Err(get_err( + "Failed to read CAR header", + "Version is None".to_string(), + )); + }; + + if version.downcast::()?.extract::()? != 1 { + return Err(get_err( + "Failed to read CAR header", + "Unsupported version. Version must be 1".to_string(), + )); + } + + let Some(roots) = header.get_item("roots")? else { + return Err(get_err( + "Failed to read CAR header", + "Roots is None".to_string(), + )); + }; + + let roots = roots.downcast::()?; + if roots.len() == 0 { + return Err(get_err( + "Failed to read CAR header", + "Roots is empty. Must be at least one".to_string(), + )); + } + + // FIXME (MarshalX): we are not verifying if the roots are valid CIDs + let parsed_blocks = PyDict::new_bound(py); loop { if let Err(_) = read_u64_leb128(buf) { + // we are not raising an error here because of possible EOF break; } - let cid = read_cid_from_bytes(buf); - if let Err(_) = cid { - break; - } + let cid_result = read_cid_from_bytes(buf); + let Ok(cid) = cid_result else { + return Err(get_err( + "Failed to read CID of block", + cid_result.unwrap_err().to_string(), + )); + }; - let block = decode_dag_cbor_to_pyobject(py, buf, 0); - if let Ok(block) = block { - parsed_blocks - .set_item(cid.unwrap().to_string().to_object(py), block) - .unwrap(); - } else { - break; + if cid.codec() != 0x71 { + return Err(get_err( + "Failed to read CAR block", + "Unsupported codec. For now we support only DAG-CBOR (0x71)".to_string(), + )); } + + let block_result = decode_dag_cbor_to_pyobject(py, buf, 0); + let Ok(block) = block_result else { + return Err(get_err( + "Failed to read CAR block", + block_result.unwrap_err().to_string(), + )); + }; + + let cid_base = cid.to_string().to_object(py); + parsed_blocks.set_item(cid_base, block).unwrap(); } - Ok((header, parsed_blocks)) + Ok((header_obj, parsed_blocks)) } #[pyfunction] From 184d2f0710fc64068e30e1a34e27a594ecd15f5a Mon Sep 17 00:00:00 2001 From: "Ilya (Marshal)" Date: Mon, 11 Mar 2024 14:23:49 +0100 Subject: [PATCH 6/8] simplify checks --- pytests/test_decode_car.py | 6 +++--- src/lib.rs | 12 ++---------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/pytests/test_decode_car.py b/pytests/test_decode_car.py index 0657bdb..8744cb9 100644 --- a/pytests/test_decode_car.py +++ b/pytests/test_decode_car.py @@ -35,13 +35,13 @@ def test_decode_car_invalid_header_len() -> None: assert 'Invalid uvarint' in str(exc_info.value) -def test_decode_car_invalid_header() -> None: - with pytest.raises(ValueError) as exc_info: +def test_decode_car_invalid_header_type() -> None: + with pytest.raises(TypeError) as exc_info: header_len = bytes.fromhex('33') # 3 header_obj = libipld.encode_dag_cbor('strInsteadOfObj') libipld.decode_car(header_len + header_obj) - assert 'Header is not a object' in str(exc_info.value) + assert "cannot be converted to 'PyDict'" in str(exc_info.value) def test_decode_car_invalid_header_version_key() -> None: diff --git a/src/lib.rs b/src/lib.rs index c0e09a0..dd06b71 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -353,15 +353,7 @@ pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(PyObject, Boun )); }; - if !header_obj.bind(py).is_instance_of::() { - return Err(get_err( - "Failed to read CAR header", - "Header is not a object".to_string(), - )); - } - - let header = header_obj.downcast_bound::(py).unwrap(); - + let header = header_obj.downcast_bound::(py)?; let Some(version) = header.get_item("version")? else { return Err(get_err( "Failed to read CAR header", @@ -397,7 +389,7 @@ pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(PyObject, Boun loop { if let Err(_) = read_u64_leb128(buf) { - // we are not raising an error here because of possible EOF + // FIXME (MarshalX): we are not raising an error here because of possible EOF break; } From eee3182f0b5687f1c3a7fc6b74fac3f51807c85e Mon Sep 17 00:00:00 2001 From: "Ilya (Marshal)" Date: Mon, 11 Mar 2024 14:24:57 +0100 Subject: [PATCH 7/8] cleanup --- src/lib.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index dd06b71..bfc3f44 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -345,7 +345,6 @@ pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(PyObject, Boun "Invalid uvarint".to_string(), )); } - let Ok(header_obj) = decode_dag_cbor_to_pyobject(py, buf, 0) else { return Err(get_err( "Failed to read CAR header", @@ -354,13 +353,13 @@ pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(PyObject, Boun }; let header = header_obj.downcast_bound::(py)?; + let Some(version) = header.get_item("version")? else { return Err(get_err( "Failed to read CAR header", "Version is None".to_string(), )); }; - if version.downcast::()?.extract::()? != 1 { return Err(get_err( "Failed to read CAR header", @@ -374,9 +373,7 @@ pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(PyObject, Boun "Roots is None".to_string(), )); }; - - let roots = roots.downcast::()?; - if roots.len() == 0 { + if roots.downcast::()?.len() == 0 { return Err(get_err( "Failed to read CAR header", "Roots is empty. Must be at least one".to_string(), From f0d1449231d0bf7f144698a53ec0cccdf971d58e Mon Sep 17 00:00:00 2001 From: "Ilya (Marshal)" Date: Mon, 11 Mar 2024 14:31:26 +0100 Subject: [PATCH 8/8] fix possible panics --- src/lib.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bfc3f44..b8211f5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -315,8 +315,12 @@ fn read_u64_leb128(r: &mut R) -> Result { } fn read_cid_from_bytes(r: &mut R) -> CidResult { - let version = read_u64_leb128(r).unwrap(); - let codec = read_u64_leb128(r).unwrap(); + let Ok(version) = read_u64_leb128(r) else { + return Err(CidError::VarIntDecodeError); + }; + let Ok(codec) = read_u64_leb128(r) else { + return Err(CidError::VarIntDecodeError); + }; if [version, codec] == [0x12, 0x20] { let mut digest = [0u8; 32];