Skip to content

ut: add serialize/deserialize tests for spec #56

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 141 additions & 23 deletions crates/paimon/src/spec/data_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use crate::spec::RowType;
use crate::spec::{BinaryTableStats, RowType};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::fmt::{Display, Formatter};
Expand Down Expand Up @@ -48,24 +48,6 @@ impl BinaryRow {
}
}

/// TODO: implement me.
/// Placeholder for the per-column statistics of a data file.
///
/// The alias currently resolves to the unit type `()`, so a value of this
/// type carries no statistics at all.
///
/// Impl References: <https://github.com/apache/paimon/blob/release-0.8.2/paimon-core/src/main/java/org/apache/paimon/stats/SimpleStats.java>
type SimpleStats = ();

/// The source of a file.
/// TODO: move me to the manifest module.
///
/// The enum is `#[repr(u8)]` with explicit discriminants, so each variant has
/// a fixed one-byte numeric value (`Append = 0`, `Compact = 1`).
///
/// NOTE(review): the derived serde implementations identify variants by their
/// (camelCase-renamed) names, not by the `u8` discriminant — confirm this
/// matches the encoding used by the Java reference below.
///
/// Impl References: <https://github.com/apache/paimon/blob/release-0.8.2/paimon-core/src/main/java/org/apache/paimon/manifest/FileSource.java>
#[repr(u8)]
#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum FileSource {
/// Discriminant 0. Presumably a file produced by an append write — TODO confirm against the Java reference.
Append = 0,
/// Discriminant 1. Presumably a file produced by compaction — TODO confirm against the Java reference.
Compact = 1,
}

/// Metadata of a data file.
///
/// Impl References: <https://github.com/apache/paimon/blob/release-0.8.2/paimon-core/src/main/java/org/apache/paimon/io/DataFileMeta.java>
Expand All @@ -78,8 +60,8 @@ pub struct DataFileMeta {
pub row_count: i64,
pub min_key: BinaryRow,
pub max_key: BinaryRow,
pub key_stats: SimpleStats,
pub value_stats: SimpleStats,
pub key_stats: Option<BinaryTableStats>,
pub value_stats: Option<BinaryTableStats>,
pub min_sequence_number: i64,
pub max_sequence_number: i64,
pub schema_id: i64,
Expand All @@ -90,7 +72,6 @@ pub struct DataFileMeta {
pub delete_row_count: Option<i64>,
// file index filter bytes, if it is small, store in data file meta
pub embedded_index: Option<Vec<u8>>,
pub file_source: Option<FileSource>,
}

impl Display for DataFileMeta {
Expand All @@ -100,6 +81,143 @@ impl Display for DataFileMeta {
}

// Read-only accessors for `DataFileMeta`, plus the (not yet implemented)
// schema constant. Every accessor either borrows from `self` or returns a
// copy of the stored value; none of them mutates the metadata.
impl DataFileMeta {
// TODO: implement me
// Placeholder: the schema is currently built from an empty field list.
pub const SCHEMA: RowType = RowType::new(vec![]);

/// Returns the file name, borrowed from this metadata as a string slice.
pub fn file_name(&self) -> &str {
&self.file_name
}

/// Returns the stored file size.
pub fn file_size(&self) -> i64 {
self.file_size
}

/// Returns the stored row count.
pub fn row_count(&self) -> i64 {
self.row_count
}

/// Returns a reference to the minimum key.
pub fn min_key(&self) -> &BinaryRow {
&self.min_key
}

/// Returns a reference to the maximum key.
pub fn max_key(&self) -> &BinaryRow {
&self.max_key
}

/// Returns a reference to the key statistics, or `None` when no key
/// statistics are stored.
pub fn key_stats(&self) -> Option<&BinaryTableStats> {
self.key_stats.as_ref()
}

/// Returns a reference to the value statistics, or `None` when no value
/// statistics are stored.
pub fn value_stats(&self) -> Option<&BinaryTableStats> {
self.value_stats.as_ref()
}

/// Returns the stored minimum sequence number.
pub fn min_sequence_number(&self) -> i64 {
self.min_sequence_number
}

/// Returns the stored maximum sequence number.
pub fn max_sequence_number(&self) -> i64 {
self.max_sequence_number
}

/// Returns the stored schema id.
pub fn schema_id(&self) -> i64 {
self.schema_id
}

/// Returns the stored level.
pub fn level(&self) -> i32 {
self.level
}

/// Returns the extra files as a borrowed slice of file names.
pub fn extra_files(&self) -> &[String] {
&self.extra_files
}

/// Returns the creation time as a UTC timestamp (returned by value).
pub fn creation_time(&self) -> DateTime<Utc> {
self.creation_time
}

/// Returns the delete row count, or `None` when it is not recorded.
pub fn delete_row_count(&self) -> Option<i64> {
self.delete_row_count
}

/// Returns the embedded file index bytes as a borrowed slice, or `None`
/// when no index is embedded in this metadata.
pub fn embedded_index(&self) -> Option<&[u8]> {
self.embedded_index.as_deref()
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_data_file_meta_serialize_deserialize() {
let json_data = r#"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, @devillove084

https://github.com/apache/paimon-rust/blob/main/crates/paimon/tests/fixtures/manifest_file_meta_schema.json

This fixture appears to be just the schema of ManifestFileMeta. How is ManifestFileMeta data stored on disk?

Copy link
Member

@Xuanwo Xuanwo Aug 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should also have DataFileMeta's data type and DataFileMeta's on-disk format.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default manifest file is in Avro format. I think we should also generate a corresponding manifest file to verify deserialization. But maybe we should support an Avro/ORC reader first?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default manifest file is in Avro format. I think we should also generate a corresponding manifest file to verify deserialization. But maybe we should support an Avro/ORC reader first?

+1

{
"fileName":"test.avro",
"fileSize":1024,
"rowCount":100,
"minKey":{
"arity":1,
"nullBitsSizeInBytes":1
},
"maxKey":{
"arity":10,
"nullBitsSizeInBytes":2
},
"keyStats":null,
"valueStats":null,
"minSequenceNumber":0,
"maxSequenceNumber":100,
"schemaId":0,
"level":0,
"extraFiles":[],
"creationTime":"2024-08-13T02:03:03.106490600Z",
"deleteRowCount":5,
"embeddedIndex":null
}
"#;

let data_file_meta: DataFileMeta =
serde_json::from_str(json_data).expect("Failed to deserialize DataFileMeta");

assert_eq!(data_file_meta.file_name, "test.avro");
assert_eq!(data_file_meta.file_size, 1024);
assert_eq!(data_file_meta.row_count, 100);

assert_eq!(data_file_meta.min_key.arity, 1);
assert_eq!(data_file_meta.min_key.null_bits_size_in_bytes, 1);
assert_eq!(data_file_meta.max_key.arity, 10);
assert_eq!(data_file_meta.max_key.null_bits_size_in_bytes, 2);

assert!(data_file_meta.key_stats.is_none());
assert!(data_file_meta.value_stats.is_none());

assert_eq!(data_file_meta.min_sequence_number, 0);
assert_eq!(data_file_meta.max_sequence_number, 100);
assert_eq!(data_file_meta.schema_id, 0);
assert_eq!(data_file_meta.level, 0);
assert_eq!(data_file_meta.extra_files.len(), 0);
assert_eq!(
data_file_meta.creation_time,
DateTime::parse_from_rfc3339("2024-08-13T02:03:03.106490600Z")
.unwrap()
.with_timezone(&Utc)
);
assert_eq!(data_file_meta.delete_row_count, Some(5));
assert!(data_file_meta.embedded_index.is_none());
}
}
42 changes: 42 additions & 0 deletions crates/paimon/src/spec/manifest_file_meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,45 @@ impl Display for BinaryTableStats {
todo!()
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Parses a `ManifestFileMeta` from a JSON document and checks that every
    /// accessor returns the value written in that document.
    #[test]
    fn test_manifest_file_meta_serialize_deserialize() {
        let payload = r#"
{
"_VERSION":2,
"_FILE_NAME":"manifest_file_meta.avro",
"_FILE_SIZE":1024,
"_NUM_ADDED_FILES":5,
"_NUM_DELETED_FILES":6,
"_PARTITION_STATS":{"_MIN_VALUES":[0,1,2],"_MAX_VALUES":[3,4,5],"_NULL_COUNTS":[6,7,8]},
"_SCHEMA_ID":1
}
"#;

        let meta: ManifestFileMeta =
            serde_json::from_str(payload).expect("Failed to deserialize ManifestFileMeta.");

        // Scalar attributes of the manifest file.
        assert_eq!(meta.version(), 2);
        assert_eq!(meta.file_name(), "manifest_file_meta.avro");
        assert_eq!(meta.file_size(), 1024);
        assert_eq!(meta.num_added_files(), 5);
        assert_eq!(meta.num_deleted_files(), 6);
        assert_eq!(meta.schema_id(), 1);

        // Nested partition statistics.
        let stats = meta.partition_stats();
        assert_eq!(stats.min_values(), &[0, 1, 2]);
        assert_eq!(stats.max_values(), &[3, 4, 5]);
        assert_eq!(stats.null_counts(), &[6, 7, 8]);
    }
}
119 changes: 59 additions & 60 deletions crates/paimon/src/spec/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,65 +109,64 @@ mod tests {
use super::*;

/// A field built with `new` + `with_description` exposes exactly the values
/// it was constructed from.
#[test]
fn test_create_data_field() {
    let expected_type = DataType::Int(IntType::new());

    let field = DataField::new(1, "field1".to_string(), expected_type.clone())
        .with_description(Some("test description".to_string()));

    assert_eq!(field.id(), 1);
    assert_eq!(field.name(), "field1");
    assert_eq!(field.data_type(), &expected_type);
    assert_eq!(field.description(), Some("test description"));
}

/// `with_id` replaces the id and leaves name, type and description as built.
#[test]
fn test_new_id() {
    let int_type = DataType::Int(IntType::new());
    let base = DataField::new(1, "field1".to_string(), int_type.clone());

    let renumbered = base.with_id(2);

    assert_eq!(renumbered.id(), 2);
    assert_eq!(renumbered.name(), "field1");
    assert_eq!(renumbered.data_type(), &int_type);
    assert!(renumbered.description().is_none());
}

/// `with_name` replaces the name and leaves id, type and description as built.
#[test]
fn test_new_name() {
    let int_type = DataType::Int(IntType::new());
    let base = DataField::new(1, "field1".to_string(), int_type.clone());

    let renamed = base.with_name("field2".to_string());

    assert_eq!(renamed.id(), 1);
    assert_eq!(renamed.name(), "field2");
    assert_eq!(renamed.data_type(), &int_type);
    assert!(renamed.description().is_none());
}

/// `with_description` sets the description and leaves id, name and type as built.
#[test]
fn test_new_description() {
    let int_type = DataType::Int(IntType::new());
    let base = DataField::new(1, "field1".to_string(), int_type.clone());

    let described = base.with_description(Some("new description".to_string()));

    assert_eq!(described.id(), 1);
    assert_eq!(described.name(), "field1");
    assert_eq!(described.data_type(), &int_type);
    assert_eq!(described.description(), Some("new description"));
}

/// Every double quote in the identifier is expected to come back doubled.
#[test]
fn test_escape_identifier() {
    let input = "\"identifier\"";
    let expected = "\"\"identifier\"\"";

    assert_eq!(escape_identifier(input), expected);
}

#[test]
fn test_escape_single_quotes() {
let escaped_text = escape_single_quotes("text with 'single' quotes");
assert_eq!(escaped_text, "text with ''single'' quotes");
/// Parses a `TableSchema` from a JSON document and checks both its top-level
/// attributes and each of its three `INT` fields.
fn test_table_schema_serialize_deserialize() {
    let schema_json = r#"
{
"version" : 2,
"id" : 1,
"fields" : [ {
"id" : 0,
"name" : "f0",
"type" : "INT"
}, {
"id" : 1,
"name" : "f1",
"type" : "INT"
}, {
"id" : 2,
"name" : "f2",
"type" : "INT"
} ],
"highestFieldId" : 10,
"partitionKeys" : [ "f0" ],
"primaryKeys" : [ "f1" ],
"options" : { },
"comment" : "",
"timeMillis" : 1723440320019
}"#;

    let schema: TableSchema =
        serde_json::from_str(schema_json).expect("Failed to deserialize TableSchema");

    // Top-level attributes.
    assert_eq!(schema.version, 2);
    assert_eq!(schema.id, 1);
    assert_eq!(schema.highest_field_id, 10);
    assert_eq!(schema.partition_keys, vec!["f0"]);
    assert_eq!(schema.primary_keys, vec!["f1"]);
    assert_eq!(schema.options, HashMap::new());
    assert_eq!(schema.comment, Some("".to_string()));
    assert_eq!(schema.time_millis, 1723440320019);

    // The length check comes first so the zip below cannot silently skip a
    // missing field.
    assert_eq!(schema.fields.len(), 3);

    let int_type = DataType::Int(IntType::new());
    let expected_fields = [(0, "f0"), (1, "f1"), (2, "f2")];
    for (field, &(expected_id, expected_name)) in schema.fields.iter().zip(expected_fields.iter()) {
        assert_eq!(field.id, expected_id);
        assert_eq!(field.name, expected_name);
        assert_eq!(field.data_type(), &int_type);
    }
}
}
Loading