Skip to content

Commit

Permalink
fix:Support hashing ipaddress (facebookincubator#11640)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: facebookincubator#11640

Checksuming on ipaddress fails because the prestoHasher treats ipaddress as a HUGEINT, which it is; however, the hashing for HUGEINT is different than the hashing for binaries.

In Velox, we represent IPAddress as a HUGEINT (16 bytes) instead of varbinary. So we need to compute the checksum on the varbinary representation of the int128_t.

Reviewed By: bikramSingh91

Differential Revision: D66416538

fbshipit-source-id: 6becaae50c9cc09132190b65811f068b4578be48
  • Loading branch information
韦佟 committed Nov 26, 2024
1 parent 224c43f commit b1662c8
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 12 deletions.
12 changes: 12 additions & 0 deletions velox/functions/prestosql/aggregates/PrestoHasher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,17 @@
#include <xxhash.h>

#include "velox/functions/lib/RowsTranslationUtil.h"
#include "velox/functions/prestosql/types/IPAddressType.h"
#include "velox/functions/prestosql/types/TimestampWithTimeZoneType.h"

namespace facebook::velox::aggregate {

namespace {

FOLLY_ALWAYS_INLINE uint64_t hashBuffer(const uint8_t* data, size_t size) {
return XXH64(data, size, 0);
}

template <typename T>
FOLLY_ALWAYS_INLINE int64_t hashInteger(const T& value) {
return XXH64_round(0, value);
Expand Down Expand Up @@ -171,7 +176,14 @@ FOLLY_ALWAYS_INLINE void PrestoHasher::hash<TypeKind::HUGEINT>(
const SelectivityVector& rows,
BufferPtr& hashes) {
applyHashFunction(rows, *vector_.get(), hashes, [&](auto row) {
auto type = vector_->base()->type();
auto value = vector_->valueAt<int128_t>(row);

if (isIPAddressType(type)) {
auto byteArray = ipaddress::toIPv6ByteArray(value);
return hashBuffer(byteArray.data(), byteArray.size());
}

// Presto Java UnscaledDecimal128 representation uses signed magnitude
// representation. Only negative values differ in this representation.
// The processing here is mainly for the convenience of hash computation.
Expand Down
19 changes: 19 additions & 0 deletions velox/functions/prestosql/aggregates/tests/PrestoHasherTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <gtest/gtest.h>

#include "velox/functions/prestosql/aggregates/PrestoHasher.h"
#include "velox/functions/prestosql/types/IPAddressType.h"
#include "velox/functions/prestosql/types/TimestampWithTimeZoneType.h"
#include "velox/type/tz/TimeZoneMap.h"
#include "velox/vector/tests/utils/VectorTestBase.h"
Expand Down Expand Up @@ -430,4 +431,22 @@ TEST_F(PrestoHasherTest, timestampWithTimezone) {
{-3461822077982309083, -3461822077982309083, -8497890125589769483, 0});
}

TEST_F(PrestoHasherTest, ipAddress) {
auto makeIpAdressFromString = [](const std::string& ipAddr) -> int128_t {
auto ret = ipaddress::tryGetIPv6asInt128FromString(ipAddr);
return ret.value();
};

auto ipAddresses = makeNullableFlatVector(
std::vector<std::optional<int128_t>>{
makeIpAdressFromString("2001:db8::ff00:42:8329"),
makeIpAdressFromString("192.168.1.1"),
makeIpAdressFromString("::ffff:1.2.3.4"),
std::nullopt},
IPADDRESS());

assertHash(
ipAddresses,
{-4694347089639306204, 2704428192845283049, -1632332718929005309, 0});
}
} // namespace facebook::velox::aggregate::test
23 changes: 11 additions & 12 deletions velox/functions/prestosql/types/IPAddressType.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ constexpr int kIPV4ToV6FFIndex = 10;
constexpr int kIPV4ToV6Index = 12;
constexpr int kIPAddressBytes = 16;

inline folly::ByteArray16 toIPv6ByteArray(const int128_t& ipAddr) {
folly::ByteArray16 bytes{{0}};
memcpy(bytes.data(), &ipAddr, sizeof(ipAddr));
// Reverse because the velox is always on little endian system
// and the byte array needs to be big endian (network byte order)
std::reverse(bytes.begin(), bytes.end());
return bytes;
}

inline folly::Expected<int128_t, folly::IPAddressFormatError>
tryGetIPv6asInt128FromString(const std::string& ipAddressStr) {
auto maybeIp = folly::IPAddress::tryFromString(ipAddressStr);
Expand Down Expand Up @@ -56,8 +65,8 @@ class IPAddressType : public HugeintType {
}

int32_t compare(const int128_t& left, const int128_t& right) const override {
const auto leftAddrBytes = toIPv6ByteArray(left);
const auto rightAddrBytes = toIPv6ByteArray(right);
const auto leftAddrBytes = ipaddress::toIPv6ByteArray(left);
const auto rightAddrBytes = ipaddress::toIPv6ByteArray(right);
return memcmp(
leftAddrBytes.begin(),
rightAddrBytes.begin(),
Expand Down Expand Up @@ -87,16 +96,6 @@ class IPAddressType : public HugeintType {
obj["type"] = name();
return obj;
}

private:
static folly::ByteArray16 toIPv6ByteArray(const int128_t& ipAddr) {
folly::ByteArray16 bytes{{0}};
memcpy(bytes.data(), &ipAddr, sizeof(ipAddr));
// Reverse because the velox is always on little endian system
// and the byte array needs to be big endian (network byte order)
std::reverse(bytes.begin(), bytes.end());
return bytes;
}
};

FOLLY_ALWAYS_INLINE bool isIPAddressType(const TypePtr& type) {
Expand Down

0 comments on commit b1662c8

Please sign in to comment.