Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add partitioning scheme for unresolved shuffle and shuffle reader exec #1144

Merged
merged 3 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ballista/core/proto/ballista.proto
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,15 @@ message ShuffleWriterExecNode {
message UnresolvedShuffleExecNode {
uint32 stage_id = 1;
datafusion_common.Schema schema = 2;
uint32 output_partition_count = 4;
datafusion.Partitioning partitioning = 5;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just one small note: is there any reason we can't use 3 instead of 5?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought there was a field with field number 3 in the past that got deleted, so putting 3 for the new field would violate backward compatibility. I don't think we need to preserve back compat here though — not sure if there are users that do rolling updates to their ballista scheduler and executors, but I wanted to be safe.

https://protobuf.dev/programming-guides/proto3/#consequences

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would assume that it's not used much

}

message ShuffleReaderExecNode {
repeated ShuffleReaderPartition partition = 1;
datafusion_common.Schema schema = 2;
// The stage to read from
uint32 stage_id = 3;
datafusion.Partitioning partitioning = 4;
}

message ShuffleReaderPartition {
Expand Down
7 changes: 4 additions & 3 deletions ballista/core/src/execution_plans/shuffle_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,11 @@ impl ShuffleReaderExec {
stage_id: usize,
partition: Vec<Vec<PartitionLocation>>,
schema: SchemaRef,
partitioning: Partitioning,
) -> Result<Self> {
let properties = PlanProperties::new(
datafusion::physical_expr::EquivalenceProperties::new(schema.clone()),
// TODO partitioning may be known and could be populated here
// see https://github.com/apache/arrow-datafusion/issues/758
Partitioning::UnknownPartitioning(partition.len()),
partitioning,
datafusion::physical_plan::ExecutionMode::Bounded,
);
Ok(Self {
Expand Down Expand Up @@ -134,6 +133,7 @@ impl ExecutionPlan for ShuffleReaderExec {
self.stage_id,
self.partition.clone(),
self.schema.clone(),
self.properties().output_partitioning().clone(),
)?))
}

Expand Down Expand Up @@ -553,6 +553,7 @@ mod tests {
input_stage_id,
vec![partitions],
Arc::new(schema),
Partitioning::UnknownPartitioning(4),
)?;
let mut stream = shuffle_reader_exec.execute(0, task_ctx)?;
let batches = utils::collect_stream(&mut stream).await;
Expand Down
18 changes: 8 additions & 10 deletions ballista/core/src/execution_plans/unresolved_shuffle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,16 @@ pub struct UnresolvedShuffleExec {

impl UnresolvedShuffleExec {
/// Create a new UnresolvedShuffleExec
pub fn new(
stage_id: usize,
schema: SchemaRef,
output_partition_count: usize,
) -> Self {
pub fn new(stage_id: usize, schema: SchemaRef, partitioning: Partitioning) -> Self {
let properties = PlanProperties::new(
datafusion::physical_expr::EquivalenceProperties::new(schema.clone()),
// TODO the output partition is known and should be populated here!
// see https://github.com/apache/arrow-datafusion/issues/758
Partitioning::UnknownPartitioning(output_partition_count),
partitioning,
datafusion::physical_plan::ExecutionMode::Bounded,
);
Self {
stage_id,
schema,
output_partition_count,
output_partition_count: properties.partitioning.partition_count(),
properties,
}
}
Expand All @@ -75,7 +69,11 @@ impl DisplayAs for UnresolvedShuffleExec {
) -> std::fmt::Result {
match t {
DisplayFormatType::Default | DisplayFormatType::Verbose => {
write!(f, "UnresolvedShuffleExec")
write!(
f,
"UnresolvedShuffleExec: {:?}",
self.properties().output_partitioning()
)
}
}
}
Expand Down
6 changes: 4 additions & 2 deletions ballista/core/src/serde/generated/ballista.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ pub struct UnresolvedShuffleExecNode {
pub stage_id: u32,
#[prost(message, optional, tag = "2")]
pub schema: ::core::option::Option<::datafusion_proto_common::Schema>,
#[prost(uint32, tag = "4")]
pub output_partition_count: u32,
#[prost(message, optional, tag = "5")]
pub partitioning: ::core::option::Option<::datafusion_proto::protobuf::Partitioning>,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ShuffleReaderExecNode {
Expand All @@ -54,6 +54,8 @@ pub struct ShuffleReaderExecNode {
/// The stage to read from
#[prost(uint32, tag = "3")]
pub stage_id: u32,
#[prost(message, optional, tag = "4")]
pub partitioning: ::core::option::Option<::datafusion_proto::protobuf::Partitioning>,
}
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ShuffleReaderPartition {
Expand Down
50 changes: 44 additions & 6 deletions ballista/core/src/serde/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
use crate::{error::BallistaError, serde::scheduler::Action as BallistaAction};

use arrow_flight::sql::ProstMessageExt;
use datafusion::arrow::datatypes::SchemaRef;
use datafusion::common::{DataFusionError, Result};
use datafusion::execution::FunctionRegistry;
use datafusion::physical_plan::{ExecutionPlan, Partitioning};
Expand All @@ -29,6 +30,8 @@ use datafusion_proto::logical_plan::file_formats::{
JsonLogicalExtensionCodec, ParquetLogicalExtensionCodec,
};
use datafusion_proto::physical_plan::from_proto::parse_protobuf_hash_partitioning;
use datafusion_proto::physical_plan::from_proto::parse_protobuf_partitioning;
use datafusion_proto::physical_plan::to_proto::serialize_partitioning;
use datafusion_proto::protobuf::proto_error;
use datafusion_proto::protobuf::{LogicalPlanNode, PhysicalPlanNode};
use datafusion_proto::{
Expand Down Expand Up @@ -291,8 +294,11 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec {
)?))
}
PhysicalPlanType::ShuffleReader(shuffle_reader) => {
let default_codec =
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it make sense to make default_codec property BallistaPhysicalExtensionCodec, so it can be overridden if needed? BallistaLogicalExtensionCodec does similar thing. (also there is few other places where DefaultPhysicalCodec is created)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good point, thanks!

datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec {};
let stage_id = shuffle_reader.stage_id as usize;
let schema = Arc::new(convert_required!(shuffle_reader.schema)?);
let schema: SchemaRef =
Arc::new(convert_required!(shuffle_reader.schema)?);
let partition_location: Vec<Vec<PartitionLocation>> = shuffle_reader
.partition
.iter()
Expand All @@ -309,16 +315,39 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec {
.collect::<Result<Vec<_>, _>>()
})
.collect::<Result<Vec<_>, DataFusionError>>()?;
let shuffle_reader =
ShuffleReaderExec::try_new(stage_id, partition_location, schema)?;
let partitioning = parse_protobuf_partitioning(
shuffle_reader.partitioning.as_ref(),
registry,
schema.as_ref(),
&default_codec,
)?;
let partitioning = partitioning
.ok_or_else(|| proto_error("missing required partitioning field"))?;
let shuffle_reader = ShuffleReaderExec::try_new(
stage_id,
partition_location,
schema,
partitioning,
)?;
Ok(Arc::new(shuffle_reader))
}
PhysicalPlanType::UnresolvedShuffle(unresolved_shuffle) => {
let schema = Arc::new(convert_required!(unresolved_shuffle.schema)?);
let default_codec =
datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec {};
let schema: SchemaRef =
Arc::new(convert_required!(unresolved_shuffle.schema)?);
let partitioning = parse_protobuf_partitioning(
unresolved_shuffle.partitioning.as_ref(),
registry,
schema.as_ref(),
&default_codec,
)?;
let partitioning = partitioning
.ok_or_else(|| proto_error("missing required partitioning field"))?;
Ok(Arc::new(UnresolvedShuffleExec::new(
unresolved_shuffle.stage_id as usize,
schema,
unresolved_shuffle.output_partition_count as usize,
partitioning,
)))
}
}
Expand Down Expand Up @@ -387,12 +416,17 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec {
.collect::<Result<Vec<_>, _>>()?,
});
}
let default_codec =
datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec {};
let partitioning =
serialize_partitioning(&exec.properties().partitioning, &default_codec)?;
let proto = protobuf::BallistaPhysicalPlanNode {
physical_plan_type: Some(PhysicalPlanType::ShuffleReader(
protobuf::ShuffleReaderExecNode {
stage_id,
partition,
schema: Some(exec.schema().as_ref().try_into()?),
partitioning: Some(partitioning),
},
)),
};
Expand All @@ -404,12 +438,16 @@ impl PhysicalExtensionCodec for BallistaPhysicalExtensionCodec {

Ok(())
} else if let Some(exec) = node.as_any().downcast_ref::<UnresolvedShuffleExec>() {
let default_codec =
datafusion_proto::physical_plan::DefaultPhysicalExtensionCodec {};
let partitioning =
serialize_partitioning(&exec.properties().partitioning, &default_codec)?;
let proto = protobuf::BallistaPhysicalPlanNode {
physical_plan_type: Some(PhysicalPlanType::UnresolvedShuffle(
protobuf::UnresolvedShuffleExecNode {
stage_id: exec.stage_id as u32,
schema: Some(exec.schema().as_ref().try_into()?),
output_partition_count: exec.output_partition_count as u32,
partitioning: Some(partitioning),
},
)),
};
Expand Down
15 changes: 6 additions & 9 deletions ballista/scheduler/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,7 @@ fn create_unresolved_shuffle(
Arc::new(UnresolvedShuffleExec::new(
shuffle_writer.stage_id(),
shuffle_writer.schema(),
shuffle_writer
.properties()
.output_partitioning()
.partition_count(),
shuffle_writer.properties().output_partitioning().clone(),
))
}

Expand Down Expand Up @@ -239,6 +236,10 @@ pub fn remove_unresolved_shuffles(
unresolved_shuffle.stage_id,
relevant_locations,
unresolved_shuffle.schema().clone(),
unresolved_shuffle
.properties()
.output_partitioning()
.clone(),
)?))
} else {
new_children.push(remove_unresolved_shuffles(
Expand All @@ -259,16 +260,12 @@ pub fn rollback_resolved_shuffles(
let mut new_children: Vec<Arc<dyn ExecutionPlan>> = vec![];
for child in stage.children() {
if let Some(shuffle_reader) = child.as_any().downcast_ref::<ShuffleReaderExec>() {
let output_partition_count = shuffle_reader
.properties()
.output_partitioning()
.partition_count();
let stage_id = shuffle_reader.stage_id;

let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new(
stage_id,
shuffle_reader.schema(),
output_partition_count,
shuffle_reader.properties().partitioning.clone(),
));
new_children.push(unresolved_shuffle);
} else {
Expand Down
Loading