Skip to content

Commit

Permalink
Support all cast patterns from varchar to timestamp for spark
Browse files Browse the repository at this point in the history
Fix
  • Loading branch information
liujiayi771 committed Nov 3, 2024
1 parent e5234d1 commit 4b60b3a
Show file tree
Hide file tree
Showing 13 changed files with 476 additions and 31 deletions.
8 changes: 4 additions & 4 deletions velox/expression/PrestoCastHooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,17 @@ Expected<Timestamp> PrestoCastHooks::castStringToTimestamp(
// If the parsed string has timezone information, convert the timestamp at
// GMT at that time. For example, "1970-01-01 00:00:00 -00:01" is 60 seconds
// at GMT.
if (result.second != nullptr) {
result.first.toGMT(*result.second);
if (result.tz != nullptr) {
result.timestamp.toGMT(*result.tz);

}
// If no timezone information is available in the input string, check if we
// should understand it as being at the session timezone, and if so, convert
// to GMT.
else if (options_.timeZone != nullptr) {
result.first.toGMT(*options_.timeZone);
result.timestamp.toGMT(*options_.timeZone);
}
return result.first;
return result.timestamp;
}

Expected<Timestamp> PrestoCastHooks::castIntToTimestamp(int64_t seconds) const {
Expand Down
3 changes: 2 additions & 1 deletion velox/functions/prestosql/DateTimeFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -1499,7 +1499,8 @@ struct FromIso8601Timestamp {
return castResult.error();
}

auto [ts, timeZone] = castResult.value();
auto ts = castResult.value().timestamp;
auto timeZone = castResult.value().tz;
// Input string may not contain a timezone - if so, it is interpreted in
// session timezone.
if (!timeZone) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ void castFromString(
if (castResult.hasError()) {
context.setStatus(row, castResult.error());
} else {
auto [ts, timeZone] = castResult.value();
auto ts = castResult.value().timestamp;
auto timeZone = castResult.value().tz;
// Input string may not contain a timezone - if so, it is interpreted in
// session timezone.
if (timeZone == nullptr) {
Expand Down
8 changes: 4 additions & 4 deletions velox/functions/sparksql/specialforms/SparkCastExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ exec::ExprPtr SparkCastCallToSpecialForm::constructSpecialForm(
const TypePtr& type,
std::vector<exec::ExprPtr>&& compiledChildren,
bool trackCpuUsage,
const core::QueryConfig& /*config*/) {
const core::QueryConfig& config) {
VELOX_CHECK_EQ(
compiledChildren.size(),
1,
Expand All @@ -33,14 +33,14 @@ exec::ExprPtr SparkCastCallToSpecialForm::constructSpecialForm(
std::move(compiledChildren[0]),
trackCpuUsage,
false,
std::make_shared<SparkCastHooks>());
std::make_shared<SparkCastHooks>(config));
}

exec::ExprPtr SparkTryCastCallToSpecialForm::constructSpecialForm(
const TypePtr& type,
std::vector<exec::ExprPtr>&& compiledChildren,
bool trackCpuUsage,
const core::QueryConfig& /*config*/) {
const core::QueryConfig& config) {
VELOX_CHECK_EQ(
compiledChildren.size(),
1,
Expand All @@ -51,6 +51,6 @@ exec::ExprPtr SparkTryCastCallToSpecialForm::constructSpecialForm(
std::move(compiledChildren[0]),
trackCpuUsage,
true,
std::make_shared<SparkCastHooks>());
std::make_shared<SparkCastHooks>(config));
}
} // namespace facebook::velox::functions::sparksql
50 changes: 48 additions & 2 deletions velox/functions/sparksql/specialforms/SparkCastHooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,60 @@

#include "velox/functions/sparksql/specialforms/SparkCastHooks.h"
#include "velox/functions/lib/string/StringImpl.h"
#include "velox/type/TimestampConversion.h"
#include "velox/type/tz/TimeZoneMap.h"

namespace facebook::velox::functions::sparksql {

// Builds the Spark cast hooks from the query configuration. When the config
// names a session time zone, it is resolved once here and stored in
// options_.timeZone.
SparkCastHooks::SparkCastHooks(const core::QueryConfig& config) : CastHooks() {
  const auto zoneName = config.sessionTimezone();
  if (zoneName.empty()) {
    // No session time zone configured: leave options_.timeZone untouched.
    return;
  }
  options_.timeZone = tz::locateZone(zoneName);
}

Expected<Timestamp> SparkCastHooks::castStringToTimestamp(
const StringView& view) const {
return util::fromTimestampString(
// Allows all patterns supported by Spark:
// `[+-]yyyy*`
// `[+-]yyyy*-[m]m`
// `[+-]yyyy*-[m]m-[d]d`
// `[+-]yyyy*-[m]m-[d]d `
// `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
// `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
// `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
// `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
//
// where `zone_id` should have one of the forms:
// 1. Z - Zulu time zone UTC+0
// 2. +|-[h]h:[m]m
// 3. A short id, see
// https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
// 4. An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
// and a suffix in the following formats:
// a. +|-h[h]
// b. +|-hh[:]mm
// c. +|-hh:mm:ss
// d. +|-hhmmss
// 5. Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
const auto conversionResult = util::fromTimestampWithTimezoneString(
view.data(), view.size(), util::TimestampParseMode::kSparkCast);
if (conversionResult.hasError()) {
return folly::makeUnexpected(conversionResult.error());
}

auto result = conversionResult.value();

if (result.tz != nullptr) {
// If the parsed string has timezone information, convert the timestamp to
// GMT using that timezone together with its extra seconds offset.
result.timestamp.toGMT(*result.tz, result.secondsOffset);
} else if (options_.timeZone != nullptr) {
// If the input string contains no timezone information, determine whether
// it should be interpreted as being in the session timezone and, if so,
// convert it to GMT.
result.timestamp.toGMT(*options_.timeZone);
}
return result.timestamp;
}

Expected<Timestamp> SparkCastHooks::castIntToTimestamp(int64_t seconds) const {
Expand Down
6 changes: 6 additions & 0 deletions velox/functions/sparksql/specialforms/SparkCastHooks.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@
#pragma once

#include "velox/expression/CastHooks.h"
#include "velox/expression/EvalCtx.h"

namespace facebook::velox::functions::sparksql {

// This class provides cast hooks following Spark semantics.
class SparkCastHooks : public exec::CastHooks {
public:
explicit SparkCastHooks(const velox::core::QueryConfig& config);

// TODO: Spark hook allows more string patterns than Presto.
Expected<Timestamp> castStringToTimestamp(
const StringView& view) const override;
Expand Down Expand Up @@ -59,5 +62,8 @@ class SparkCastHooks : public exec::CastHooks {
}

exec::PolicyType getPolicy() const override;

private:
TimestampToStringOptions options_ = {};
};
} // namespace facebook::velox::functions::sparksql
78 changes: 78 additions & 0 deletions velox/functions/sparksql/tests/SparkCastExprTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,10 @@ TEST_F(SparkCastExprTest, invalidDate) {

TEST_F(SparkCastExprTest, stringToTimestamp) {
std::vector<std::optional<std::string>> input{
"2015",
"-2015",
"2015-03",
"-2015-03",
"1970-01-01",
"2000-01-01",
"1970-01-01 00:00:00",
Expand All @@ -241,12 +245,37 @@ TEST_F(SparkCastExprTest, stringToTimestamp) {
"2015-03-18T12:03:17",
"2015-03-18 12:03:17",
"2015-03-18T12:03:17",
"2015-03-18T12:03:17Z",
"2015-03-18 12:03:17Z",
"2015-03-18 12:03:17.123",
"2015-03-18T12:03:17.123",
"2015-03-18T12:03:17.456",
"2015-03-18 12:03:17.456",
"2015-03-18T12:03:17.456Z",
"2015-03-18 12:03:17.456Z",
"2015-03-18T12:03:17-1:0",
"2015-03-18T12:03:17-01:00",
"2015-03-18T12:03:17+07:30",
"2015-03-18T12:03:17+7:3",
"2015-03-18T12:03:17.123-1:0",
"2015-03-18T12:03:17.123-01:00",
"2015-03-18T12:03:17.123+07:30",
"2015-03-18T12:03:17.123+7:3",
"2015-03-18 12:03:17.123UTC+8",
"2015-03-18 12:03:17.123UTC+8:1",
"2015-03-18T12:03:17.123GMT+081010",
"2015-03-18T12:03:17.123GMT+8:10:10",
"2015-03-18T12:03:17.123GMT+8:10",
"2015-03-18T12:03:17.123UT+00:00:10",
"2015-03-18 12:03:17.123456789",
"2015-03-18 12:03:17.123Etc/GMT+1",
"2015-03-18 12:03:17.123CTT",
};
std::vector<std::optional<Timestamp>> expected{
Timestamp(1420070400, 0),
Timestamp(-125754422400, 0),
Timestamp(1425168000, 0),
Timestamp(-125749324800, 0),
Timestamp(0, 0),
Timestamp(946684800, 0),
Timestamp(0, 0),
Expand All @@ -255,14 +284,63 @@ TEST_F(SparkCastExprTest, stringToTimestamp) {
Timestamp(1426680197, 0),
Timestamp(1426680197, 0),
Timestamp(1426680197, 0),
Timestamp(1426680197, 0),
Timestamp(1426680197, 0),
Timestamp(1426680197, 123000000),
Timestamp(1426680197, 123000000),
Timestamp(1426680197, 456000000),
Timestamp(1426680197, 456000000),
Timestamp(1426680197, 456000000),
Timestamp(1426680197, 456000000),
Timestamp(1426680197 + 1 * 60 * 60, 0),
Timestamp(1426680197 + 1 * 60 * 60, 0),
Timestamp(1426680197 - (7 * 60 * 60 + 30 * 60), 0),
Timestamp(1426680197 - (7 * 60 * 60 + 3 * 60), 0),
Timestamp(1426680197 + 1 * 60 * 60, 123000000),
Timestamp(1426680197 + 1 * 60 * 60, 123000000),
Timestamp(1426680197 - (7 * 60 * 60 + 30 * 60), 123000000),
Timestamp(1426680197 - (7 * 60 * 60 + 3 * 60), 123000000),
Timestamp(1426680197 - 8 * 60 * 60, 123000000),
Timestamp(1426680197 - (8 * 60 * 60 + 1 * 60), 123000000),
Timestamp(1426680197 - (8 * 60 * 60 + 10 * 60 + 10), 123000000),
Timestamp(1426680197 - (8 * 60 * 60 + 10 * 60 + 10), 123000000),
Timestamp(1426680197 - (8 * 60 * 60 + 10 * 60), 123000000),
Timestamp(1426680197 - 10, 123000000),
Timestamp(1426680197, 123456000),
// Etc/GMT+1 and GMT-1 are equivalent.
Timestamp(1426680197 + 1 * 60 * 60, 123000000),
Timestamp(1426680197 - 8 * 60 * 60, 123000000),
};
testCast<std::string, Timestamp>("timestamp", input, expected);
}

// Checks that malformed timestamp strings are rejected when cast to TIMESTAMP.
// Each testInvalidCast call pairs one input string with the error text the
// cast is expected to produce (the helper itself is defined outside this
// view).
TEST_F(SparkCastExprTest, invalidStringToTimestamp) {
// Year followed by a dangling '-' with no month digits.
testInvalidCast<StringView>(
"timestamp",
{"2015-"},
"Cannot cast VARCHAR '2015-' to TIMESTAMP. Unable to parse timestamp value: \"2015-\"");
// Year-month form with month value 13 (presumably rejected as out of range).
testInvalidCast<StringView>(
"timestamp",
{"2015-13"},
"Cannot cast VARCHAR '2015-13' to TIMESTAMP. Unable to parse timestamp value: \"2015-13\"");
// Lower-case "utc" prefix: expected to surface as an unknown-timezone error
// rather than a normalization failure.
testInvalidCast<StringView>(
"timestamp",
{"2015-03-18 12:03:17.123utc+080000"},
"Unknown timezone value: \"utc+080000\"");
// Eight digits after the sign, which is longer than the six-digit hhmmss form.
testInvalidCast<StringView>(
"timestamp",
{"2015-03-18 12:03:17.123UTC+08000012"},
"Failed to normalize spark timezone value: \"UTC+08000012\"");
// hhmmss form whose last two digits are 61 (presumably an invalid seconds
// value).
testInvalidCast<StringView>(
"timestamp",
{"2015-03-18 12:03:17.123UTC+080061"},
"Failed to normalize spark timezone value: \"UTC+080061\"");
// Colon-separated offset with single-digit hour and minute plus a seconds
// field (presumably not a valid hh:mm:ss spelling).
testInvalidCast<StringView>(
"timestamp",
{"2015-03-18 12:03:17.123UTC+8:6:10"},
"Failed to normalize spark timezone value: \"UTC+8:6:10\"");
}

TEST_F(SparkCastExprTest, intToTimestamp) {
// Cast bigint as timestamp.
testCast(
Expand Down
13 changes: 13 additions & 0 deletions velox/type/Timestamp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,19 @@ void Timestamp::toGMT(const tz::TimeZone& zone) {
seconds_ = sysSeconds.count();
}

// Converts this timestamp from `zone` to GMT via the single-argument
// overload, then shifts the result by `secondsOffset` seconds. Raises a user
// error if the shifted value would fall outside [kMinSeconds, kMaxSeconds];
// in that case the zone conversion has already been applied to this object.
void Timestamp::toGMT(const tz::TimeZone& zone, int32_t secondsOffset) {
  toGMT(zone);

  const auto shiftedSeconds = seconds_ + secondsOffset;
  const bool withinRange =
      shiftedSeconds >= kMinSeconds && shiftedSeconds <= kMaxSeconds;
  if (!withinRange) {
    VELOX_USER_FAIL(
        "The seconds offset in timezone will get invalid timestamp, "
        "timestamp is {}, seconds offset is {}",
        toString(),
        secondsOffset);
  }
  seconds_ = shiftedSeconds;
}

std::chrono::time_point<std::chrono::system_clock, std::chrono::milliseconds>
Timestamp::toTimePointMs(bool allowOverflow) const {
using namespace std::chrono;
Expand Down
3 changes: 3 additions & 0 deletions velox/type/Timestamp.h
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,9 @@ struct Timestamp {
// ts.toString(); // returns January 1, 1970 08:00:00
void toGMT(const tz::TimeZone& zone);

// Converts the timestamp to GMT time, then adds the given seconds offset.
// Fails with a user error if the adjusted seconds fall outside the supported
// range.
void toGMT(const tz::TimeZone& zone, int32_t secondsOffset);

/// Assuming the timestamp represents a GMT time, converts it to the time at
/// the same moment at zone. For example:
///
Expand Down
Loading

0 comments on commit 4b60b3a

Please sign in to comment.