Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 20 additions & 11 deletions src/paimon/format/orc/complex_predicate_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,6 @@ class ComplexPredicateTest : public ::testing::Test {
// Per-test fixture initialization: acquires the default memory pool and fixes
// the batch size used by the tests in this fixture.
//
// The Arrow field list that used to be built here was never read; each test
// constructs the schema it needs locally, so the dead local was removed.
void SetUp() override {
    pool_ = GetDefaultPool();
    batch_size_ = 10;
}
// Intentionally empty: this fixture performs no explicit per-test cleanup.
void TearDown() override {}

Expand Down Expand Up @@ -131,16 +123,33 @@ TEST_F(ComplexPredicateTest, TestSimple) {
arrow::field("f5", arrow::decimal128(23, 5)),
};
auto read_schema = arrow::schema(fields);
auto expected_array = std::dynamic_pointer_cast<arrow::StructArray>(
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_({fields}), R"([
std::shared_ptr<arrow::StructArray> expected_array;
if (::paimon::test::OsReleaseDetector::IsDebian()) {
// refer: https://github.com/eggert/tz/blob/main/asia#L653
// When using the Asia/Shanghai timezone under Debian, timestamps prior to 1901 have an
// additional offset of 5 minutes and 43 seconds
expected_array = std::dynamic_pointer_cast<arrow::StructArray>(
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_({fields}), R"([
[10, 1, 1234, "2033-05-18 03:33:20.0", "123456789987654321.45678"],
[10, 1, 19909, "2033-05-18 03:33:20.000001001", "12.30000"],
[10, 1, 0, "2008-12-28 00:00:00.000123456", null],
[10, 1, 100, "2008-12-28 00:00:00.00012345", "-123.45000"],
[10, 1, null, "1899-01-01 01:05:03.001001001", "0.00000"],
[10, 1, 20006, "2024-10-10 10:10:10.100100100", "1728551410100.10010"]
])")
.ValueOrDie());
} else {
expected_array = std::dynamic_pointer_cast<arrow::StructArray>(
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_({fields}), R"([
[10, 1, 1234, "2033-05-18 03:33:20.0", "123456789987654321.45678"],
[10, 1, 19909, "2033-05-18 03:33:20.000001001", "12.30000"],
[10, 1, 0, "2008-12-28 00:00:00.000123456", null],
[10, 1, 100, "2008-12-28 00:00:00.00012345", "-123.45000"],
[10, 1, null, "1899-01-01 00:59:20.001001001", "0.00000"],
[10, 1, 20006, "2024-10-10 10:10:10.100100100", "1728551410100.10010"]
])")
.ValueOrDie());
.ValueOrDie());
}

// date
{
Expand Down
2 changes: 2 additions & 0 deletions src/paimon/format/orc/orc_file_batch_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ Result<::orc::RowReaderOptions> OrcFileBatchReader::CreateRowReaderOptions(
}
row_reader_options.include(include_fields);
row_reader_options.searchArgument(std::move(search_arg));
// refer: https://github.com/apache/arrow/pull/34591
row_reader_options.setTimezoneName("GMT");
Comment on lines +245 to +246
Copy link

Copilot AI Dec 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While setting the reader timezone to GMT fixes the reading issue, the ORC writer should also be configured to write timestamps with GMT timezone to ensure consistency. Based on the PR description, this is the proper long-term solution. Consider adding writer_options.setTimezoneName("GMT"); in the PrepareWriterOptions method (around line 230 in orc_format_writer.cpp) to ensure both reading and writing use the same timezone, which would eliminate the need for OS-specific test expectations.

Copilot uses AI. Check for mistakes.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In fact, C++ Paimon currently does not support user-configurable readerTimezone. When it is not configured, the default setting is already GMT, so there is no need to add this line here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you very much. Your analysis is correct, but I believe we should explicitly set the timezone – this serves as a marker of best practice and also makes it easier for other maintainers to understand the code.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. I'll explicitly set the timezone in both the reader and the writer later.


PAIMON_ASSIGN_OR_RAISE(
bool enable_lazy_decoding,
Expand Down
20 changes: 18 additions & 2 deletions src/paimon/format/orc/orc_file_batch_reader_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -499,15 +499,31 @@ TEST_P(OrcFileBatchReaderTest, TestComplexType) {
ASSERT_OK_AND_ASSIGN(auto result_array,
paimon::test::ReadResultCollector::CollectResult(orc_batch_reader.get()));
std::shared_ptr<arrow::ChunkedArray> expected_array;
auto array_status = arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([
arrow::Status array_status;
if (::paimon::test::OsReleaseDetector::IsDebian()) {
// refer: https://github.com/eggert/tz/blob/main/asia#L653
// When using the Asia/Shanghai timezone under Debian, timestamps prior to 1901 have an
// additional offset of 5 minutes and 43 seconds
array_status = arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([
[10, 1, 1234, "2033-05-18 03:33:20.0", "123456789987654321.45678", "add"],
[10, 1, 19909, "2033-05-18 03:33:20.000001001", "12.30000", "cat"],
[10, 1, 0, "2008-12-28 00:00:00.000123456", null, "dad"],
[10, 1, 100, "2008-12-28 00:00:00.00012345", "-123.45000", "eat"],
[10, 1, null, "1899-01-01 01:05:03.001001001", "0.00000", "fat"],
[10, 1, 20006, "2024-10-10 10:10:10.1001001", "1728551410100.10010", null]
])"},
&expected_array);
} else {
array_status = arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([
[10, 1, 1234, "2033-05-18 03:33:20.0", "123456789987654321.45678", "add"],
[10, 1, 19909, "2033-05-18 03:33:20.000001001", "12.30000", "cat"],
[10, 1, 0, "2008-12-28 00:00:00.000123456", null, "dad"],
[10, 1, 100, "2008-12-28 00:00:00.00012345", "-123.45000", "eat"],
[10, 1, null, "1899-01-01 00:59:20.001001001", "0.00000", "fat"],
[10, 1, 20006, "2024-10-10 10:10:10.1001001", "1728551410100.10010", null]
])"},
&expected_array);
&expected_array);
}
ASSERT_TRUE(array_status.ok());
ASSERT_TRUE(result_array->Equals(*expected_array));
}
Expand Down
21 changes: 21 additions & 0 deletions src/paimon/testing/utils/testharness.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#pragma once

#include <filesystem>
#include <fstream>
#include <map>
#include <memory>
#include <string>
Expand All @@ -69,6 +70,26 @@ std::string GetJindoTestDir();

int64_t RandomNumber(int64_t min, int64_t max);

/// Test helper that identifies the host Linux distribution from os-release data.
///
/// Tests use it to select platform-specific expected values.
class OsReleaseDetector {
 public:
    /// Reports whether the host's `/etc/os-release` declares `ID=debian`.
    ///
    /// @return true if the file can be opened and contains an `ID` entry whose
    ///         value is exactly `debian`; false otherwise (including when the
    ///         file is missing or unreadable).
    static bool IsDebian() {
        std::ifstream file("/etc/os-release");
        if (!file.is_open()) {
            return false;
        }
        return IsDebianRelease(file);
    }

    /// Scans os-release formatted text for an `ID` entry equal to `debian`.
    ///
    /// The value is compared exactly (after dropping trailing whitespace and an
    /// optional pair of surrounding quotes) rather than by substring, so IDs
    /// that merely contain "debian" are not misreported as Debian. Lines such
    /// as `ID_LIKE=debian` are ignored because they do not start with `ID=`.
    ///
    /// @param in stream positioned at the start of the os-release content.
    /// @return true if some `ID=` line has the value `debian`.
    static bool IsDebianRelease(std::istream& in) {
        const std::string id_key = "ID=";
        const std::string debian_id = "debian";

        std::string line;
        while (std::getline(in, line)) {
            // Prefix test only; avoids searching the whole line for "ID=".
            if (line.compare(0, id_key.size(), id_key) != 0) {
                continue;
            }

            std::string value = line.substr(id_key.size());

            // Tolerate trailing blanks and CR from files with CRLF line endings.
            while (!value.empty() &&
                   (value.back() == ' ' || value.back() == '\t' || value.back() == '\r')) {
                value.pop_back();
            }

            // os-release values may be wrapped in single or double quotes.
            if (value.size() >= 2 && (value.front() == '"' || value.front() == '\'') &&
                value.back() == value.front()) {
                value = value.substr(1, value.size() - 2);
            }

            if (value == debian_id) {
                return true;
            }
        }
        return false;
    }
};

::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s);

#define ASSERT_OK(expr) \
Expand Down
17 changes: 15 additions & 2 deletions test/inte/read_inte_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -659,13 +659,26 @@ TEST_P(ReadInteTest, TestAppendReadWithComplexTypePredicate) {
DataField::ConvertDataFieldsToArrowStructType(fields_with_row_kind);

std::shared_ptr<arrow::ChunkedArray> expected_array;
auto array_status = arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([
arrow::Status array_status;
if (::paimon::test::OsReleaseDetector::IsDebian() && param.file_format == "orc") {
// refer: https://github.com/eggert/tz/blob/main/asia#L653
// When using the Asia/Shanghai timezone under Debian, timestamps prior to 1901 have an
// additional offset of 5 minutes and 43 seconds
array_status = arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([
[0, "add", 1, "2033-05-18 03:33:20.0", 1234, "123456789987654321.45678"],
[0, "cat", 1, "2033-05-18 03:33:20.000001001", 19909, "12.30000"],
[0, "fat", 1, "1899-01-01 01:05:03.001001001", null, "0.00000"]
])"},
&expected_array);
} else {
array_status = arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([
[0, "add", 1, "2033-05-18 03:33:20.0", 1234, "123456789987654321.45678"],
[0, "cat", 1, "2033-05-18 03:33:20.000001001", 19909, "12.30000"],
[0, "fat", 1, "1899-01-01 00:59:20.001001001", null, "0.00000"],
[0, "bad", 1, "1899-01-01 00:59:20.001001001", -1234, "-123456789987654321.45678"]
])"},
&expected_array);
&expected_array);
}
ASSERT_TRUE(array_status.ok());
ASSERT_TRUE(result_array);
ASSERT_TRUE(result_array->Equals(*expected_array));
Expand Down
Loading