From 91b1ff41830482f6ed0469bf38dd9919de4bba2b Mon Sep 17 00:00:00 2001
From: Aaron Patterson
Date: Fri, 12 Apr 2024 09:53:00 -0700
Subject: [PATCH] Optimized forwarding callers and callees

This patch optimizes forwarding callers and callees. It only optimizes
methods that take `...` as their sole parameter and then pass `...` on
to other calls.

Calls it optimizes look like this:

```ruby
def bar(a) = a
def foo(...) = bar(...) # optimized
foo(123)
```

```ruby
def bar(a) = a
def foo(...) = bar(1, 2, ...) # optimized
foo(123)
```

```ruby
def bar(*a) = a

def foo(...)
  list = [1, 2]
  bar(*list, ...) # optimized
end
foo(123)
```

All variants of the above, but using `super`, are also optimized,
including a bare super like this:

```ruby
def foo(...)
  super
end
```

This patch eliminates intermediate allocations made when calling
methods that accept `...`. We can observe allocation elimination like
this:

```ruby
def m
  x = GC.stat(:total_allocated_objects)
  yield
  GC.stat(:total_allocated_objects) - x
end

def bar(a) = a
def foo(...) = bar(...)

def test
  m { foo(123) }
end

test
p test # allocates 1 object on master, but 0 objects with this patch
```

```ruby
def bar(a, b:) = a + b
def foo(...) = bar(...)

def test
  m { foo(1, b: 2) }
end

test
p test # allocates 2 objects on master, but 0 objects with this patch
```

How does it work?
-----------------

This patch works by using a dynamic stack size when passing forwarded
parameters to callees. The caller's info object (known as the "CI")
contains the stack size of the parameters, so we pass the CI object
itself as a parameter to the callee. When forwarding parameters, the
forwarding ISeq uses the caller's CI to determine how much stack to
copy, then copies the caller's stack before calling the callee. The CI
at the forwarded call site is adjusted using information from the
caller's CI.

I think this description is kind of confusing, so let's walk through an
example with code.

```ruby
def delegatee(a, b) = a + b

def delegator(...)
  delegatee(...) # CI2 (FORWARDING)
end

def caller
  delegator(1, 2) # CI1 (argc: 2)
end
```

Before we call the delegator method, the stack looks like this:

```
Executing Line | Code                                  | Stack
---------------+---------------------------------------+--------
              1| def delegatee(a, b) = a + b           | self
              2|                                       | 1
              3| def delegator(...)                    | 2
              4|   #                                   |
              5|   delegatee(...) # CI2 (FORWARDING)   |
              6| end                                   |
              7|                                       |
              8| def caller                            |
          ->  9|   delegator(1, 2) # CI1 (argc: 2)     |
             10| end                                   |
```

The ISeq for `delegator` is tagged as "forwardable", so when `caller`
calls into `delegator`, it writes `CI1` onto the stack as a local
variable for the `delegator` method. The `delegator` method has a
special local called `...` that holds the caller's CI object.

Here is the ISeq disasm for `delegator`:

```
== disasm: #<ISeq:delegator@...>
local table (size: 1, argc: 0 [opts: 0, rest: -1, post: 0, block: -1, kw: -1@-1, kwrest: -1])
[ 1] "..."@0
0000 putself                                                          ( 1)[LiCa]
0001 getlocal_WC_0                          "..."@0
0003 send                                   <calldata!mid:delegatee, argc:0, FCALL|FORWARDING>, nil
0006 leave                                  [Re]
```

The local called `...` will contain the caller's CI: CI1.

Here is the stack when we enter `delegator`:

```
Executing Line | Code                                  | Stack
---------------+---------------------------------------+--------
              1| def delegatee(a, b) = a + b           | self
              2|                                       | 1
              3| def delegator(...)                    | 2
          ->  4|   #                                   | CI1 (argc: 2)
              5|   delegatee(...) # CI2 (FORWARDING)   | cref_or_me
              6| end                                   | specval
              7|                                       | type
              8| def caller                            |
              9|   delegator(1, 2) # CI1 (argc: 2)     |
             10| end                                   |
```

The CI at `delegatee` on line 5 is tagged as "FORWARDING", so it knows
to memcopy the caller's stack before calling `delegatee`. In this case,
it will memcopy self, 1, and 2 to the stack before calling `delegatee`.
It knows how much memory to copy from the caller because `CI1` contains
stack size information (argc: 2).
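If you want to see the forwarding ISeq for yourself, it is easy to dump
from Ruby. Here is a minimal sketch, assuming a build that includes
this patch (the exact calldata text and instruction offsets may
differ):

```ruby
def delegatee(a, b) = a + b
def delegator(...) = delegatee(...)

# Dump the bytecode for the forwarding method. With this patch the
# local table should contain only the special "..." entry and the call
# site's calldata should carry the FORWARDING flag; without the patch,
# `...` expands into *, ** and & locals and the call uses the
# splat/kwsplat/blockarg flags instead.
puts RubyVM::InstructionSequence.of(method(:delegator)).disasm
```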
Before executing the `send` instruction, we push `...` on the stack.
The `send` instruction pops `...`, and because it is tagged with
`FORWARDING`, it knows to memcopy (using the information in the CI it
just popped):

```
== disasm: #<ISeq:delegator@...>
local table (size: 1, argc: 0 [opts: 0, rest: -1, post: 0, block: -1, kw: -1@-1, kwrest: -1])
[ 1] "..."@0
0000 putself                                                          ( 1)[LiCa]
0001 getlocal_WC_0                          "..."@0
0003 send                                   <calldata!mid:delegatee, argc:0, FCALL|FORWARDING>, nil
0006 leave                                  [Re]
```

Instruction 0001 puts the caller's CI on the stack. `send` is tagged
with FORWARDING, so it reads the CI and _copies_ the caller's stack to
this stack:

```
Executing Line | Code                                  | Stack
---------------+---------------------------------------+--------
              1| def delegatee(a, b) = a + b           | self
              2|                                       | 1
              3| def delegator(...)                    | 2
              4|   #                                   | CI1 (argc: 2)
          ->  5|   delegatee(...) # CI2 (FORWARDING)   | cref_or_me
              6| end                                   | specval
              7|                                       | type
              8| def caller                            | self
              9|   delegator(1, 2) # CI1 (argc: 2)     | 1
             10| end                                   | 2
```

The "FORWARDING" call site combines information from CI1 with CI2 in
order to support passing other values in addition to the `...` value,
as well as to perfectly forward splat args, kwargs, etc.

Since we're able to copy the stack from `caller` into `delegator`'s
stack, we can avoid allocating objects.

I want to do this to eliminate object allocations for delegate methods.
My long-term goal is to implement `Class#new` in Ruby, and that
implementation uses `...`. I was able to implement `Class#new` in Ruby
[here](https://github.com/ruby/ruby/pull/9289). If we adopt the
technique in this patch, then we can optimize allocating objects that
take keyword parameters for `initialize`. For example, this code will
allocate 2 objects: one for `SomeObject`, and one for the kwargs:

```ruby
SomeObject.new(foo: 1)
```

If we combine this technique, plus implement `Class#new` in Ruby, then
we can reduce allocations for this common operation.

Co-Authored-By: John Hawthorn
Co-Authored-By: Alan Wu
---
 bootstraptest/test_method.rb           | 132 ++++++++++++++++
 compile.c                              |  87 +++++++++--
 imemo.c                                |   5 +-
 insns.def                              |  27 +++-
 iseq.c                                 |  12 ++
 lib/prism/debug.rb                     |   6 +-
 parse.y                                |   2 +
 prism_compile.c                        |  72 ++++++---
 proc.c                                 |   2 +-
 rjit_c.rb                              |   1 +
 rubyparser.h                           |   1 +
 test/ruby/test_allocation.rb           |  52 +++----
 test/ruby/test_iseq.rb                 |  10 ++
 test/ruby/test_method.rb               |  14 ++
 test/ruby/test_yjit.rb                 |  10 +-
 tool/ruby_vm/views/_sp_inc_helpers.erb |   2 +-
 vm.c                                   |   9 +-
 vm_args.c                              |  53 +++++++
 vm_callinfo.h                          |  22 ++-
 vm_core.h                              |   1 +
 vm_insnhelper.c                        | 204 ++++++++++++++++++++++---
 vm_insnhelper.h                        |   5 +
 vm_method.c                            |   1 +
 yjit.c                                 |   6 +
 yjit/bindgen/src/main.rs               |   1 +
 yjit/src/codegen.rs                    |  32 +++-
 yjit/src/cruby.rs                      |   1 +
 yjit/src/cruby_bindings.inc.rs         |   4 +-
 yjit/src/stats.rs                      |   2 +
 29 files changed, 675 insertions(+), 101 deletions(-)

diff --git a/bootstraptest/test_method.rb b/bootstraptest/test_method.rb
index d1d1f57d55b0bb..fba8c679a61933 100644
--- a/bootstraptest/test_method.rb
+++ b/bootstraptest/test_method.rb
@@ -1176,3 +1176,135 @@ def foo
   foo
   foo
 }, '[Bug #20178]'
+
+assert_equal 'ok', %q{
+  def bar(x); x; end
+  def foo(...); bar(...); end
+  foo('ok')
+}
+
+assert_equal 'ok', %q{
+  def bar(x); x; end
+  def foo(z, ...); bar(...); end
+  foo(1, 'ok')
+}
+
+assert_equal 'ok', %q{
+  def bar(x, y); x; end
+  def foo(...); bar("ok", ...); end
+  foo(1)
+}
+
+assert_equal 'ok', %q{
+  def bar(x); x; end
+  def foo(...); 1.times { return bar(...)
}; end + foo("ok") +} + +assert_equal 'ok', %q{ + def bar(x); x; end + def foo(...); x = nil; 1.times { x = bar(...) }; x; end + foo("ok") +} + +assert_equal 'ok', %q{ + def bar(x); yield; end + def foo(...); bar(...); end + foo(1) { "ok" } +} + +assert_equal 'ok', %q{ + def baz(x); x; end + def bar(...); baz(...); end + def foo(...); bar(...); end + foo("ok") +} + +assert_equal '[1, 2, 3, 4]', %q{ + def baz(a, b, c, d); [a, b, c, d]; end + def bar(...); baz(1, ...); end + def foo(...); bar(2, ...); end + foo(3, 4) +} + +assert_equal 'ok', %q{ + class Foo; def self.foo(x); x; end; end + class Bar < Foo; def self.foo(...); super; end; end + Bar.foo('ok') +} + +assert_equal 'ok', %q{ + class Foo; def self.foo(x); x; end; end + class Bar < Foo; def self.foo(...); super(...); end; end + Bar.foo('ok') +} + +assert_equal 'ok', %q{ + class Foo; def self.foo(x, y); x + y; end; end + class Bar < Foo; def self.foo(...); super("o", ...); end; end + Bar.foo('k') +} + +assert_equal 'ok', %q{ + def bar(a); a; end + def foo(...); lambda { bar(...) }; end + foo("ok").call +} + +assert_equal 'ok', %q{ + class Foo; def self.foo(x, y); x + y; end; end + class Bar < Foo; def self.y(&b); b; end; def self.foo(...); y { super("o", ...) }; end; end + Bar.foo('k').call +} + +assert_equal 'ok', %q{ + def baz(n); n; end + def foo(...); bar = baz(...); lambda { lambda { bar } }; end + foo("ok").call.call +} + +assert_equal 'ok', %q{ + class A; def self.foo(...); new(...); end; attr_reader :b; def initialize(a, b:"ng"); @a = a; @b = b; end end + A.foo(1).b + A.foo(1, b: "ok").b +} + +assert_equal 'ok', %q{ + class A; def initialize; @a = ["ok"]; end; def first(...); @a.first(...); end; end + def call x; x.first; end + def call1 x; x.first(1); end + call(A.new) + call1(A.new).first +} + +assert_equal 'ok', %q{ + class A; def foo; yield("o"); end; end + class B < A; def foo(...); super { |x| yield(x + "k") }; end; end + B.new.foo { |x| x } +} + +assert_equal "[1, 2, 3, 4]", %q{ + def foo(*b) = b + + def forward(...) + splat = [1,2,3] + foo(*splat, ...) + end + + forward(4) +} + +assert_equal "[1, 2, 3, 4]", %q{ +class A + def foo(*b) = b +end + +class B < A + def foo(...) + splat = [1,2,3] + super(*splat, ...) 
+ end +end + +B.new.foo(4) +} diff --git a/compile.c b/compile.c index 3fb1566ad47d90..8de27cd9e17355 100644 --- a/compile.c +++ b/compile.c @@ -490,7 +490,7 @@ static int iseq_setup_insn(rb_iseq_t *iseq, LINK_ANCHOR *const anchor); static int iseq_optimize(rb_iseq_t *iseq, LINK_ANCHOR *const anchor); static int iseq_insns_unification(rb_iseq_t *iseq, LINK_ANCHOR *const anchor); -static int iseq_set_local_table(rb_iseq_t *iseq, const rb_ast_id_table_t *tbl); +static int iseq_set_local_table(rb_iseq_t *iseq, const rb_ast_id_table_t *tbl, const NODE *const node_args); static int iseq_set_exception_local_table(rb_iseq_t *iseq); static int iseq_set_arguments(rb_iseq_t *iseq, LINK_ANCHOR *const anchor, const NODE *const node); @@ -871,12 +871,12 @@ rb_iseq_compile_node(rb_iseq_t *iseq, const NODE *node) if (node == 0) { NO_CHECK(COMPILE(ret, "nil", node)); - iseq_set_local_table(iseq, 0); + iseq_set_local_table(iseq, 0, 0); } /* assume node is T_NODE */ else if (nd_type_p(node, NODE_SCOPE)) { /* iseq type of top, method, class, block */ - iseq_set_local_table(iseq, RNODE_SCOPE(node)->nd_tbl); + iseq_set_local_table(iseq, RNODE_SCOPE(node)->nd_tbl, (NODE *)RNODE_SCOPE(node)->nd_args); iseq_set_arguments(iseq, ret, (NODE *)RNODE_SCOPE(node)->nd_args); switch (ISEQ_BODY(iseq)->type) { @@ -1441,7 +1441,7 @@ new_callinfo(rb_iseq_t *iseq, ID mid, int argc, unsigned int flag, struct rb_cal { VM_ASSERT(argc >= 0); - if (!(flag & (VM_CALL_ARGS_SPLAT | VM_CALL_ARGS_BLOCKARG | VM_CALL_KW_SPLAT)) && + if (!(flag & (VM_CALL_ARGS_SPLAT | VM_CALL_ARGS_BLOCKARG | VM_CALL_KW_SPLAT | VM_CALL_FORWARDING)) && kw_arg == NULL && !has_blockiseq) { flag |= VM_CALL_ARGS_SIMPLE; } @@ -2024,6 +2024,13 @@ iseq_set_arguments(rb_iseq_t *iseq, LINK_ANCHOR *const optargs, const NODE *cons } block_id = args->block_arg; + bool optimized_forward = (args->forwarding && args->pre_args_num == 0 && !args->opt_args); + + if (optimized_forward) { + rest_id = 0; + block_id = 0; + } + if (args->opt_args) { const rb_node_opt_arg_t *node = args->opt_args; LABEL *label; @@ -2080,7 +2087,7 @@ iseq_set_arguments(rb_iseq_t *iseq, LINK_ANCHOR *const optargs, const NODE *cons if (args->kw_args) { arg_size = iseq_set_arguments_keywords(iseq, optargs, args, arg_size); } - else if (args->kw_rest_arg) { + else if (args->kw_rest_arg && !optimized_forward) { ID kw_id = iseq->body->local_table[arg_size]; struct rb_iseq_param_keyword *keyword = ZALLOC_N(struct rb_iseq_param_keyword, 1); keyword->rest_start = arg_size++; @@ -2100,6 +2107,12 @@ iseq_set_arguments(rb_iseq_t *iseq, LINK_ANCHOR *const optargs, const NODE *cons body->param.flags.has_block = TRUE; } + // Only optimize specifically methods like this: `foo(...)` + if (optimized_forward) { + body->param.flags.forwardable = TRUE; + arg_size = 1; + } + iseq_calc_param_size(iseq); body->param.size = arg_size; @@ -2129,13 +2142,26 @@ iseq_set_arguments(rb_iseq_t *iseq, LINK_ANCHOR *const optargs, const NODE *cons } static int -iseq_set_local_table(rb_iseq_t *iseq, const rb_ast_id_table_t *tbl) +iseq_set_local_table(rb_iseq_t *iseq, const rb_ast_id_table_t *tbl, const NODE *const node_args) { unsigned int size = tbl ? tbl->size : 0; + unsigned int offset = 0; + + if (node_args) { + struct rb_args_info *args = &RNODE_ARGS(node_args)->nd_ainfo; + + // If we have a function that only has `...` as the parameter, + // then its local table should only be `...` + // FIXME: I think this should be fixed in the AST rather than special case here. 
+ if (args->forwarding && args->pre_args_num == 0 && !args->opt_args) { + size -= 3; + offset += 3; + } + } if (size > 0) { ID *ids = (ID *)ALLOC_N(ID, size); - MEMCPY(ids, tbl->ids, ID, size); + MEMCPY(ids, tbl->ids + offset, ID, size); ISEQ_BODY(iseq)->local_table = ids; } ISEQ_BODY(iseq)->local_table_size = size; @@ -4110,7 +4136,7 @@ iseq_specialized_instruction(rb_iseq_t *iseq, INSN *iobj) } } - if ((vm_ci_flag(ci) & VM_CALL_ARGS_BLOCKARG) == 0 && blockiseq == NULL) { + if ((vm_ci_flag(ci) & (VM_CALL_ARGS_BLOCKARG | VM_CALL_FORWARDING)) == 0 && blockiseq == NULL) { iobj->insn_id = BIN(opt_send_without_block); iobj->operand_size = insn_len(iobj->insn_id) - 1; } @@ -6272,9 +6298,33 @@ setup_args(rb_iseq_t *iseq, LINK_ANCHOR *const args, const NODE *argn, unsigned int dup_rest = 1; DECL_ANCHOR(arg_block); INIT_ANCHOR(arg_block); - NO_CHECK(COMPILE(arg_block, "block", RNODE_BLOCK_PASS(argn)->nd_body)); - *flag |= VM_CALL_ARGS_BLOCKARG; + if (RNODE_BLOCK_PASS(argn)->forwarding && ISEQ_BODY(ISEQ_BODY(iseq)->local_iseq)->param.flags.forwardable) { + int idx = ISEQ_BODY(ISEQ_BODY(iseq)->local_iseq)->local_table_size;// - get_local_var_idx(iseq, idDot3); + + RUBY_ASSERT(nd_type_p(RNODE_BLOCK_PASS(argn)->nd_head, NODE_ARGSPUSH)); + const NODE * arg_node = + RNODE_ARGSPUSH(RNODE_BLOCK_PASS(argn)->nd_head)->nd_head; + + int argc = 0; + + // Only compile leading args: + // foo(x, y, ...) + // ^^^^ + if (nd_type_p(arg_node, NODE_ARGSCAT)) { + argc += setup_args_core(iseq, args, RNODE_ARGSCAT(arg_node)->nd_head, dup_rest, flag, keywords); + } + + *flag |= VM_CALL_FORWARDING; + + ADD_GETLOCAL(args, argn, idx, get_lvar_level(iseq)); + return INT2FIX(argc); + } + else { + *flag |= VM_CALL_ARGS_BLOCKARG; + + NO_CHECK(COMPILE(arg_block, "block", RNODE_BLOCK_PASS(argn)->nd_body)); + } if (LIST_INSN_SIZE_ONE(arg_block)) { LINK_ELEMENT *elem = FIRST_ELEMENT(arg_block); @@ -6306,7 +6356,7 @@ build_postexe_iseq(rb_iseq_t *iseq, LINK_ANCHOR *ret, const void *ptr) ADD_INSN1(ret, body, putspecialobject, INT2FIX(VM_SPECIAL_OBJECT_VMCORE)); ADD_CALL_WITH_BLOCK(ret, body, id_core_set_postexe, argc, block); RB_OBJ_WRITTEN(iseq, Qundef, (VALUE)block); - iseq_set_local_table(iseq, 0); + iseq_set_local_table(iseq, 0, 0); } static void @@ -9391,6 +9441,13 @@ compile_super(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node, i ADD_GETLOCAL(args, node, idx, lvar_level); } + /* forward ... 
*/ + if (local_body->param.flags.forwardable) { + flag |= VM_CALL_FORWARDING; + int idx = local_body->local_table_size - get_local_var_idx(liseq, idDot3); + ADD_GETLOCAL(args, node, idx, lvar_level); + } + if (local_body->param.flags.has_opt) { /* optional arguments */ int j; @@ -12935,7 +12992,8 @@ ibf_dump_iseq_each(struct ibf_dump *dump, const rb_iseq_t *iseq) (body->param.flags.has_block << 6) | (body->param.flags.ambiguous_param0 << 7) | (body->param.flags.accepts_no_kwarg << 8) | - (body->param.flags.ruby2_keywords << 9); + (body->param.flags.ruby2_keywords << 9) | + (body->param.flags.forwardable << 10); #if IBF_ISEQ_ENABLE_LOCAL_BUFFER # define IBF_BODY_OFFSET(x) (x) @@ -13149,8 +13207,9 @@ ibf_load_iseq_each(struct ibf_load *load, rb_iseq_t *iseq, ibf_offset_t offset) load_body->param.flags.ambiguous_param0 = (param_flags >> 7) & 1; load_body->param.flags.accepts_no_kwarg = (param_flags >> 8) & 1; load_body->param.flags.ruby2_keywords = (param_flags >> 9) & 1; - load_body->param.flags.anon_rest = (param_flags >> 10) & 1; - load_body->param.flags.anon_kwrest = (param_flags >> 11) & 1; + load_body->param.flags.forwardable = (param_flags >> 10) & 1; + load_body->param.flags.anon_rest = (param_flags >> 11) & 1; + load_body->param.flags.anon_kwrest = (param_flags >> 12) & 1; load_body->param.size = param_size; load_body->param.lead_num = param_lead_num; load_body->param.opt_num = param_opt_num; diff --git a/imemo.c b/imemo.c index 0031b3322ccf8b..0dd0f9ffa49850 100644 --- a/imemo.c +++ b/imemo.c @@ -315,8 +315,9 @@ rb_imemo_mark_and_move(VALUE obj, bool reference_updating) } } else { - if (vm_cc_super_p(cc) || vm_cc_refinement_p(cc)) { + if (vm_cc_orphan_p(cc) || vm_cc_refinement_p(cc)) { rb_gc_mark_movable((VALUE)cc->cme_); + rb_gc_mark_movable((VALUE)cc->klass); } } @@ -471,7 +472,7 @@ vm_ccs_free(struct rb_class_cc_entries *ccs, int alive, VALUE klass) } } - VM_ASSERT(!vm_cc_super_p(cc) && !vm_cc_refinement_p(cc)); + VM_ASSERT(!vm_cc_orphan_p(cc) && !vm_cc_refinement_p(cc)); vm_cc_invalidate(cc); } ruby_xfree(ccs->entries); diff --git a/insns.def b/insns.def index 9c649904b88278..550e0a8952c524 100644 --- a/insns.def +++ b/insns.def @@ -867,10 +867,22 @@ send // attr rb_snum_t sp_inc = sp_inc_of_sendish(cd->ci); // attr rb_snum_t comptime_sp_inc = sp_inc_of_sendish(ci); { - VALUE bh = vm_caller_setup_arg_block(ec, GET_CFP(), cd->ci, blockiseq, false); + struct rb_forwarding_call_data adjusted_cd; + struct rb_callinfo adjusted_ci; + + CALL_DATA _cd = cd; + + VALUE bh = vm_caller_setup_args(GET_EC(), GET_CFP(), &cd, blockiseq, 0, &adjusted_cd, &adjusted_ci); + val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_method); JIT_EXEC(ec, val); + if (vm_ci_flag(_cd->ci) & VM_CALL_FORWARDING) { + if (_cd->cc != cd->cc && vm_cc_markable(cd->cc)) { + RB_OBJ_WRITE(GET_ISEQ(), &_cd->cc, cd->cc); + } + } + if (UNDEF_P(val)) { RESTORE_REGS(); NEXT_INSN(); @@ -991,10 +1003,21 @@ invokesuper // attr rb_snum_t sp_inc = sp_inc_of_sendish(cd->ci); // attr rb_snum_t comptime_sp_inc = sp_inc_of_sendish(ci); { - VALUE bh = vm_caller_setup_arg_block(ec, GET_CFP(), cd->ci, blockiseq, true); + CALL_DATA _cd = cd; + struct rb_forwarding_call_data adjusted_cd; + struct rb_callinfo adjusted_ci; + + VALUE bh = vm_caller_setup_args(GET_EC(), GET_CFP(), &cd, blockiseq, 1, &adjusted_cd, &adjusted_ci); + val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_super); JIT_EXEC(ec, val); + if (vm_ci_flag(_cd->ci) & VM_CALL_FORWARDING) { + if (_cd->cc != cd->cc && vm_cc_markable(cd->cc)) { + RB_OBJ_WRITE(GET_ISEQ(), 
&_cd->cc, cd->cc); + } + } + if (UNDEF_P(val)) { RESTORE_REGS(); NEXT_INSN(); diff --git a/iseq.c b/iseq.c index 3659dab2f516a7..4339fec28416bd 100644 --- a/iseq.c +++ b/iseq.c @@ -2404,6 +2404,7 @@ rb_insn_operand_intern(const rb_iseq_t *iseq, CALL_FLAG(KWARG); CALL_FLAG(KW_SPLAT); CALL_FLAG(KW_SPLAT_MUT); + CALL_FLAG(FORWARDING); CALL_FLAG(OPT_SEND); /* maybe not reachable */ rb_ary_push(ary, rb_ary_join(flags, rb_str_new2("|"))); } @@ -3449,6 +3450,17 @@ rb_iseq_parameters(const rb_iseq_t *iseq, int is_proc) CONST_ID(req, "req"); CONST_ID(opt, "opt"); + + if (body->param.flags.forwardable) { + // [[:rest, :*], [:keyrest, :**], [:block, :&]] + CONST_ID(rest, "rest"); + CONST_ID(keyrest, "keyrest"); + CONST_ID(block, "block"); + rb_ary_push(args, rb_ary_new_from_args(2, ID2SYM(rest), ID2SYM(idMULT))); + rb_ary_push(args, rb_ary_new_from_args(2, ID2SYM(keyrest), ID2SYM(idPow))); + rb_ary_push(args, rb_ary_new_from_args(2, ID2SYM(block), ID2SYM(idAnd))); + } + if (is_proc) { for (i = 0; i < body->param.lead_num; i++) { PARAM_TYPE(opt); diff --git a/lib/prism/debug.rb b/lib/prism/debug.rb index 74f824faa70d4f..3f202311dc6a96 100644 --- a/lib/prism/debug.rb +++ b/lib/prism/debug.rb @@ -141,7 +141,11 @@ def self.prism_locals(source) sorted << AnonymousLocal if params.keywords.any? if params.keyword_rest.is_a?(ForwardingParameterNode) - sorted.push(:*, :**, :&, :"...") + if sorted.length == 0 + sorted.push(:"...") + else + sorted.push(:*, :**, :&, :"...") + end elsif params.keyword_rest.is_a?(KeywordRestParameterNode) sorted << (params.keyword_rest.name || :**) end diff --git a/parse.y b/parse.y index 3665f9b7c07fd6..7be290b2c441b2 100644 --- a/parse.y +++ b/parse.y @@ -12486,6 +12486,7 @@ static rb_node_block_pass_t * rb_node_block_pass_new(struct parser_params *p, NODE *nd_body, const YYLTYPE *loc) { rb_node_block_pass_t *n = NODE_NEWNODE(NODE_BLOCK_PASS, rb_node_block_pass_t, loc); + n->forwarding = 0; n->nd_head = 0; n->nd_body = nd_body; @@ -15347,6 +15348,7 @@ new_args_forward_call(struct parser_params *p, NODE *leading, const YYLTYPE *loc #endif rb_node_block_pass_t *block = NEW_BLOCK_PASS(NEW_LVAR(idFWD_BLOCK, loc), loc); NODE *args = leading ? 
rest_arg_append(p, leading, rest, argsloc) : NEW_SPLAT(rest, loc); + block->forwarding = TRUE; #ifndef FORWARD_ARGS_WITH_RUBY2_KEYWORDS args = arg_append(p, args, new_hash(p, kwrest, loc), loc); #endif diff --git a/prism_compile.c b/prism_compile.c index 2c50c19c1dfb26..e5c6b1c3926656 100644 --- a/prism_compile.c +++ b/prism_compile.c @@ -1407,7 +1407,17 @@ pm_setup_args_core(const pm_arguments_node_t *arguments_node, const pm_node_t *b break; } case PM_FORWARDING_ARGUMENTS_NODE: { + if (ISEQ_BODY(ISEQ_BODY(iseq)->local_iseq)->param.flags.forwardable) { + *flags |= VM_CALL_FORWARDING; + + pm_local_index_t mult_local = pm_lookup_local_index(iseq, scope_node, PM_CONSTANT_DOT3, 0); + PUSH_GETLOCAL(ret, location, mult_local.index, mult_local.level); + + break; + } + orig_argc += 2; + *flags |= VM_CALL_ARGS_SPLAT | VM_CALL_ARGS_SPLAT_MUT | VM_CALL_ARGS_BLOCKARG | VM_CALL_KW_SPLAT; // Forwarding arguments nodes are treated as foo(*, **, &) @@ -5827,6 +5837,15 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret, int argc = 0; int depth = get_lvar_level(iseq); + if (ISEQ_BODY(ISEQ_BODY(iseq)->local_iseq)->param.flags.forwardable) { + flag |= VM_CALL_FORWARDING; + pm_local_index_t mult_local = pm_lookup_local_index(iseq, scope_node, PM_CONSTANT_DOT3, 0); + PUSH_GETLOCAL(ret, location, mult_local.index, mult_local.level); + PUSH_INSN2(ret, location, invokesuper, new_callinfo(iseq, 0, 0, flag, NULL, block != NULL), block); + if (popped) PUSH_INSN(ret, location, pop); + return; + } + if (local_body->param.flags.has_lead) { /* required arguments */ for (int i = 0; i < local_body->param.lead_num; i++) { @@ -7406,7 +7425,14 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret, // When we have a `...` as the keyword_rest, it's a forwarding_parameter_node and // we need to leave space for 4 locals: *, **, &, ... if (PM_NODE_TYPE_P(parameters_node->keyword_rest, PM_FORWARDING_PARAMETER_NODE)) { - table_size += 4; + // Only optimize specifically methods like this: `foo(...)` + if (requireds_list->size == 0 && optionals_list->size == 0 && keywords_list->size == 0) { + ISEQ_BODY(iseq)->param.flags.forwardable = TRUE; + table_size += 1; + } + else { + table_size += 4; + } } else { const pm_keyword_rest_parameter_node_t *kw_rest = (const pm_keyword_rest_parameter_node_t *) parameters_node->keyword_rest; @@ -7755,29 +7781,31 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret, // def foo(...) 
// ^^^ case PM_FORWARDING_PARAMETER_NODE: { - body->param.rest_start = local_index; - body->param.flags.has_rest = true; + if (!ISEQ_BODY(iseq)->param.flags.forwardable) { + body->param.rest_start = local_index; + body->param.flags.has_rest = true; - // Add the leading * - pm_insert_local_special(idMULT, local_index++, index_lookup_table, local_table_for_iseq); + // Add the leading * + pm_insert_local_special(idMULT, local_index++, index_lookup_table, local_table_for_iseq); - // Add the kwrest ** - RUBY_ASSERT(!body->param.flags.has_kw); + // Add the kwrest ** + RUBY_ASSERT(!body->param.flags.has_kw); - // There are no keywords declared (in the text of the program) - // but the forwarding node implies we support kwrest (**) - body->param.flags.has_kw = false; - body->param.flags.has_kwrest = true; - body->param.keyword = keyword = ZALLOC_N(struct rb_iseq_param_keyword, 1); + // There are no keywords declared (in the text of the program) + // but the forwarding node implies we support kwrest (**) + body->param.flags.has_kw = false; + body->param.flags.has_kwrest = true; + body->param.keyword = keyword = ZALLOC_N(struct rb_iseq_param_keyword, 1); - keyword->rest_start = local_index; + keyword->rest_start = local_index; - pm_insert_local_special(idPow, local_index++, index_lookup_table, local_table_for_iseq); + pm_insert_local_special(idPow, local_index++, index_lookup_table, local_table_for_iseq); - body->param.block_start = local_index; - body->param.flags.has_block = true; + body->param.block_start = local_index; + body->param.flags.has_block = true; - pm_insert_local_special(idAnd, local_index++, index_lookup_table, local_table_for_iseq); + pm_insert_local_special(idAnd, local_index++, index_lookup_table, local_table_for_iseq); + } pm_insert_local_special(idDot3, local_index++, index_lookup_table, local_table_for_iseq); break; } @@ -7933,7 +7961,15 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret, } scope_node->index_lookup_table = index_lookup_table; iseq_calc_param_size(iseq); - iseq_set_local_table(iseq, local_table_for_iseq); + + if (ISEQ_BODY(iseq)->param.flags.forwardable) { + // We're treating `...` as a parameter so that frame + // pushing won't clobber it. + ISEQ_BODY(iseq)->param.size += 1; + } + + // FIXME: args? + iseq_set_local_table(iseq, local_table_for_iseq, 0); scope_node->local_table_for_iseq_size = local_table_for_iseq->size; //********STEP 5************ diff --git a/proc.c b/proc.c index bfde0d9abbb44c..06082642362d5f 100644 --- a/proc.c +++ b/proc.c @@ -1050,7 +1050,7 @@ rb_iseq_min_max_arity(const rb_iseq_t *iseq, int *max) { *max = ISEQ_BODY(iseq)->param.flags.has_rest == FALSE ? 
ISEQ_BODY(iseq)->param.lead_num + ISEQ_BODY(iseq)->param.opt_num + ISEQ_BODY(iseq)->param.post_num + - (ISEQ_BODY(iseq)->param.flags.has_kw == TRUE || ISEQ_BODY(iseq)->param.flags.has_kwrest == TRUE) + (ISEQ_BODY(iseq)->param.flags.has_kw == TRUE || ISEQ_BODY(iseq)->param.flags.has_kwrest == TRUE || ISEQ_BODY(iseq)->param.flags.forwardable == TRUE) : UNLIMITED_ARGUMENTS; return ISEQ_BODY(iseq)->param.lead_num + ISEQ_BODY(iseq)->param.post_num + (ISEQ_BODY(iseq)->param.flags.has_kw && ISEQ_BODY(iseq)->param.keyword->required_num > 0); } diff --git a/rjit_c.rb b/rjit_c.rb index 0ba33b40bf904d..2d265227e4599f 100644 --- a/rjit_c.rb +++ b/rjit_c.rb @@ -1093,6 +1093,7 @@ def C.rb_iseq_constant_body ruby2_keywords: [CType::BitField.new(1, 1), 9], anon_rest: [CType::BitField.new(1, 2), 10], anon_kwrest: [CType::BitField.new(1, 3), 11], + forwardable: [CType::BitField.new(1, 4), 12], ), Primitive.cexpr!("OFFSETOF(((struct rb_iseq_constant_body *)NULL)->param, flags)")], size: [CType::Immediate.parse("unsigned int"), Primitive.cexpr!("OFFSETOF(((struct rb_iseq_constant_body *)NULL)->param, size)")], lead_num: [CType::Immediate.parse("int"), Primitive.cexpr!("OFFSETOF(((struct rb_iseq_constant_body *)NULL)->param, lead_num)")], diff --git a/rubyparser.h b/rubyparser.h index 5ba2da8de11f98..e64bae7ce6ab1e 100644 --- a/rubyparser.h +++ b/rubyparser.h @@ -855,6 +855,7 @@ typedef struct RNode_BLOCK_PASS { struct RNode *nd_head; struct RNode *nd_body; + unsigned int forwarding: 1; } rb_node_block_pass_t; typedef struct RNode_DEFN { diff --git a/test/ruby/test_allocation.rb b/test/ruby/test_allocation.rb index 48348c0fbd514d..21f389a06fe476 100644 --- a/test/ruby/test_allocation.rb +++ b/test/ruby/test_allocation.rb @@ -569,20 +569,20 @@ def test_argument_forwarding check_allocations(<<~RUBY) def self.argument_forwarding(...); end - check_allocations(1, 1, "argument_forwarding(1, a: 2#{block})") - check_allocations(1, 0, "argument_forwarding(1, *empty_array, a: 2#{block})") - check_allocations(1, 1, "argument_forwarding(1, a:2, **empty_hash#{block})") - check_allocations(1, 1, "argument_forwarding(1, **empty_hash, a: 2#{block})") - - check_allocations(1, 0, "argument_forwarding(1, **nil#{block})") - check_allocations(1, 0, "argument_forwarding(1, **empty_hash#{block})") - check_allocations(1, 0, "argument_forwarding(1, **hash1#{block})") - check_allocations(1, 0, "argument_forwarding(1, *empty_array, **hash1#{block})") - check_allocations(1, 1, "argument_forwarding(1, **hash1, **empty_hash#{block})") - check_allocations(1, 1, "argument_forwarding(1, **empty_hash, **hash1#{block})") - - check_allocations(1, 0, "argument_forwarding(1, *empty_array#{block})") - check_allocations(1, 1, "argument_forwarding(1, **hash1, **empty_hash#{block})") + check_allocations(0, 0, "argument_forwarding(1, a: 2#{block})") + check_allocations(0, 0, "argument_forwarding(1, *empty_array, a: 2#{block})") + check_allocations(0, 1, "argument_forwarding(1, a:2, **empty_hash#{block})") + check_allocations(0, 1, "argument_forwarding(1, **empty_hash, a: 2#{block})") + + check_allocations(0, 0, "argument_forwarding(1, **nil#{block})") + check_allocations(0, 0, "argument_forwarding(1, **empty_hash#{block})") + check_allocations(0, 0, "argument_forwarding(1, **hash1#{block})") + check_allocations(0, 0, "argument_forwarding(1, *empty_array, **hash1#{block})") + check_allocations(0, 1, "argument_forwarding(1, **hash1, **empty_hash#{block})") + check_allocations(0, 1, "argument_forwarding(1, **empty_hash, **hash1#{block})") + + 
check_allocations(0, 0, "argument_forwarding(1, *empty_array#{block})") + check_allocations(0, 1, "argument_forwarding(1, **hash1, **empty_hash#{block})") check_allocations(1, 0, "argument_forwarding(1, *empty_array, *empty_array, **empty_hash#{block})") check_allocations(0, 0, "argument_forwarding(*array1, a: 2#{block})") @@ -610,20 +610,20 @@ def test_nested_argument_forwarding check_allocations(<<~RUBY) def self.argument_forwarding(...); t(...) end; def self.t(...) end - check_allocations(1, 1, "argument_forwarding(1, a: 2#{block})") - check_allocations(1, 0, "argument_forwarding(1, *empty_array, a: 2#{block})") - check_allocations(1, 1, "argument_forwarding(1, a:2, **empty_hash#{block})") - check_allocations(1, 1, "argument_forwarding(1, **empty_hash, a: 2#{block})") + check_allocations(0, 0, "argument_forwarding(1, a: 2#{block})") + check_allocations(0, 0, "argument_forwarding(1, *empty_array, a: 2#{block})") + check_allocations(0, 1, "argument_forwarding(1, a:2, **empty_hash#{block})") + check_allocations(0, 1, "argument_forwarding(1, **empty_hash, a: 2#{block})") - check_allocations(1, 0, "argument_forwarding(1, **nil#{block})") - check_allocations(1, 0, "argument_forwarding(1, **empty_hash#{block})") - check_allocations(1, 0, "argument_forwarding(1, **hash1#{block})") - check_allocations(1, 0, "argument_forwarding(1, *empty_array, **hash1#{block})") - check_allocations(1, 1, "argument_forwarding(1, **hash1, **empty_hash#{block})") - check_allocations(1, 1, "argument_forwarding(1, **empty_hash, **hash1#{block})") + check_allocations(0, 0, "argument_forwarding(1, **nil#{block})") + check_allocations(0, 0, "argument_forwarding(1, **empty_hash#{block})") + check_allocations(0, 0, "argument_forwarding(1, **hash1#{block})") + check_allocations(0, 0, "argument_forwarding(1, *empty_array, **hash1#{block})") + check_allocations(0, 1, "argument_forwarding(1, **hash1, **empty_hash#{block})") + check_allocations(0, 1, "argument_forwarding(1, **empty_hash, **hash1#{block})") - check_allocations(1, 0, "argument_forwarding(1, *empty_array#{block})") - check_allocations(1, 1, "argument_forwarding(1, **hash1, **empty_hash#{block})") + check_allocations(0, 0, "argument_forwarding(1, *empty_array#{block})") + check_allocations(0, 1, "argument_forwarding(1, **hash1, **empty_hash#{block})") check_allocations(1, 0, "argument_forwarding(1, *empty_array, *empty_array, **empty_hash#{block})") check_allocations(0, 0, "argument_forwarding(*array1, a: 2#{block})") diff --git a/test/ruby/test_iseq.rb b/test/ruby/test_iseq.rb index dc84d8bd7c4b51..e52749d57439af 100644 --- a/test/ruby/test_iseq.rb +++ b/test/ruby/test_iseq.rb @@ -95,6 +95,16 @@ def test_cdhash_after_roundtrip assert_equal(42, ISeq.load_from_binary(iseq.to_binary).eval) end + def test_forwardable + iseq = compile(<<~EOF, __LINE__+1) + Class.new { + def bar(a, b); a + b; end + def foo(...); bar(...); end + } + EOF + assert_equal(42, ISeq.load_from_binary(iseq.to_binary).eval.new.foo(40, 2)) + end + def test_super_with_block iseq = compile(<<~EOF, __LINE__+1) def (Object.new).touch(*) # :nodoc: diff --git a/test/ruby/test_method.rb b/test/ruby/test_method.rb index ec06f4c50a150d..80dbd2099584b7 100644 --- a/test/ruby/test_method.rb +++ b/test/ruby/test_method.rb @@ -1623,4 +1623,18 @@ def test_kwarg_eval_memory_leak end RUBY end + + def test_super_with_splat + c = Class.new { + attr_reader :x + + def initialize(*args) + @x, _ = args + end + } + b = Class.new(c) { def initialize(...) 
= super } + a = Class.new(b) { def initialize(*args) = super } + obj = a.new(1, 2, 3) + assert_equal 1, obj.x + end end diff --git a/test/ruby/test_yjit.rb b/test/ruby/test_yjit.rb index df71dbffc0c318..76b66572e39c73 100644 --- a/test/ruby/test_yjit.rb +++ b/test/ruby/test_yjit.rb @@ -1574,21 +1574,19 @@ def test_send_polymorphic_method_name end def test_kw_splat_nil - assert_compiles(<<~'RUBY', result: %i[ok ok ok], no_send_fallbacks: true) + assert_compiles(<<~'RUBY', result: %i[ok ok], no_send_fallbacks: true) def id(x) = x def kw_fw(arg, **) = id(arg, **) - def fw(...) = id(...) - def use = [fw(:ok), kw_fw(:ok), :ok.itself(**nil)] + def use = [kw_fw(:ok), :ok.itself(**nil)] use RUBY end def test_empty_splat - assert_compiles(<<~'RUBY', result: %i[ok ok], no_send_fallbacks: true) + assert_compiles(<<~'RUBY', result: :ok, no_send_fallbacks: true) def foo = :ok - def fw(...) = foo(...) - def use(empty) = [foo(*empty), fw] + def use(empty) = foo(*empty) use([]) RUBY diff --git a/tool/ruby_vm/views/_sp_inc_helpers.erb b/tool/ruby_vm/views/_sp_inc_helpers.erb index d0b0bd79efcff2..740fe101420033 100644 --- a/tool/ruby_vm/views/_sp_inc_helpers.erb +++ b/tool/ruby_vm/views/_sp_inc_helpers.erb @@ -18,7 +18,7 @@ sp_inc_of_sendish(const struct rb_callinfo *ci) * 3. Pop receiver. * 4. Push return value. */ - const int argb = (vm_ci_flag(ci) & VM_CALL_ARGS_BLOCKARG) ? 1 : 0; + const int argb = (vm_ci_flag(ci) & (VM_CALL_ARGS_BLOCKARG | VM_CALL_FORWARDING)) ? 1 : 0; const int argc = vm_ci_argc(ci); const int recv = 1; const int retn = 1; diff --git a/vm.c b/vm.c index 328187c790a327..ed012acb925f48 100644 --- a/vm.c +++ b/vm.c @@ -961,7 +961,14 @@ vm_make_env_each(const rb_execution_context_t * const ec, rb_control_frame_t *co local_size = VM_ENV_DATA_SIZE; } else { - local_size = ISEQ_BODY(cfp->iseq)->local_table_size + VM_ENV_DATA_SIZE; + local_size = ISEQ_BODY(cfp->iseq)->local_table_size; + if (ISEQ_BODY(cfp->iseq)->param.flags.forwardable && VM_ENV_LOCAL_P(cfp->ep)) { + int ci_offset = local_size - ISEQ_BODY(cfp->iseq)->param.size + VM_ENV_DATA_SIZE; + + CALL_INFO ci = (CALL_INFO)VM_CF_LEP(cfp)[-ci_offset]; + local_size += vm_ci_argc(ci); + } + local_size += VM_ENV_DATA_SIZE; } /* diff --git a/vm_args.c b/vm_args.c index 9df175eaa94a79..1394162ab12295 100644 --- a/vm_args.c +++ b/vm_args.c @@ -1047,3 +1047,56 @@ vm_caller_setup_arg_block(const rb_execution_context_t *ec, rb_control_frame_t * } } } + +static void vm_adjust_stack_forwarding(const struct rb_execution_context_struct *ec, struct rb_control_frame_struct *cfp, CALL_INFO callers_info, VALUE splat); + +static VALUE +vm_caller_setup_args(const rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, + CALL_DATA *cd, const rb_iseq_t *blockiseq, const int is_super, + struct rb_forwarding_call_data *adjusted_cd, struct rb_callinfo *adjusted_ci) +{ + CALL_INFO site_ci = (*cd)->ci; + VALUE bh = Qundef; + + if (vm_ci_flag(site_ci) & VM_CALL_FORWARDING) { + RUBY_ASSERT(ISEQ_BODY(ISEQ_BODY(GET_ISEQ())->local_iseq)->param.flags.forwardable); + CALL_INFO caller_ci = (CALL_INFO)TOPN(0); + + unsigned int forwarding_argc = vm_ci_argc(site_ci); + VALUE splat = Qfalse; + + if (vm_ci_flag(site_ci) & VM_CALL_ARGS_SPLAT) { + // If we're called with args_splat, the top 1 should be an array + splat = TOPN(1); + forwarding_argc += (RARRAY_LEN(splat) - 1); + } + + // Need to setup the block in case of e.g. 
`super { :block }` + if (is_super && blockiseq) { + bh = vm_caller_setup_arg_block(ec, GET_CFP(), site_ci, blockiseq, is_super); + } + else { + bh = VM_ENV_BLOCK_HANDLER(GET_LEP()); + } + + vm_adjust_stack_forwarding(ec, GET_CFP(), caller_ci, splat); + + *adjusted_ci = VM_CI_ON_STACK( + vm_ci_mid(site_ci), + (vm_ci_flag(caller_ci) | (vm_ci_flag(site_ci) & (VM_CALL_FCALL | VM_CALL_FORWARDING))), + forwarding_argc + vm_ci_argc(caller_ci), + vm_ci_kwarg(caller_ci) + ); + + adjusted_cd->cd.ci = adjusted_ci; + adjusted_cd->cd.cc = (*cd)->cc; + adjusted_cd->caller_ci = caller_ci; + + *cd = &adjusted_cd->cd; + } + else { + bh = vm_caller_setup_arg_block(ec, GET_CFP(), site_ci, blockiseq, is_super); + } + + return bh; +} diff --git a/vm_callinfo.h b/vm_callinfo.h index 21e0755aa80a1c..80b9529e91986e 100644 --- a/vm_callinfo.h +++ b/vm_callinfo.h @@ -26,6 +26,7 @@ enum vm_call_flag_bits { VM_CALL_OPT_SEND_bit, // internal flag VM_CALL_KW_SPLAT_MUT_bit, // kw splat hash can be modified (to avoid allocating a new one) VM_CALL_ARGS_SPLAT_MUT_bit, // args splat can be modified (to avoid allocating a new one) + VM_CALL_FORWARDING_bit, // m(...) VM_CALL__END }; @@ -42,6 +43,7 @@ enum vm_call_flag_bits { #define VM_CALL_OPT_SEND (0x01 << VM_CALL_OPT_SEND_bit) #define VM_CALL_KW_SPLAT_MUT (0x01 << VM_CALL_KW_SPLAT_MUT_bit) #define VM_CALL_ARGS_SPLAT_MUT (0x01 << VM_CALL_ARGS_SPLAT_MUT_bit) +#define VM_CALL_FORWARDING (0x01 << VM_CALL_FORWARDING_bit) struct rb_callinfo_kwarg { int keyword_len; @@ -255,6 +257,12 @@ vm_ci_markable(const struct rb_callinfo *ci) } } +static inline bool +vm_ci_cacheable(const struct rb_callinfo *ci) +{ + return vm_ci_markable(ci) && !(vm_ci_flag(ci) & VM_CALL_FORWARDING); +} + #define VM_CI_ON_STACK(mid_, flags_, argc_, kwarg_) \ (struct rb_callinfo) { \ .flags = T_IMEMO | \ @@ -301,12 +309,12 @@ struct rb_callcache { /* VM_CALLCACHE_IVAR used for IVAR/ATTRSET/STRUCT_AREF/STRUCT_ASET methods */ #define VM_CALLCACHE_IVAR IMEMO_FL_USER0 #define VM_CALLCACHE_BF IMEMO_FL_USER1 -#define VM_CALLCACHE_SUPER IMEMO_FL_USER2 +#define VM_CALLCACHE_ORPHAN IMEMO_FL_USER2 #define VM_CALLCACHE_REFINEMENT IMEMO_FL_USER3 enum vm_cc_type { cc_type_normal, // chained from ccs - cc_type_super, + cc_type_orphan, cc_type_refinement, }; @@ -333,11 +341,13 @@ vm_cc_new(VALUE klass, *((struct rb_callable_method_entry_struct **)&cc->cme_) = (struct rb_callable_method_entry_struct *)cme; *((vm_call_handler *)&cc->call_) = call; + VM_ASSERT(RB_TYPE_P(klass, T_CLASS) || RB_TYPE_P(klass, T_ICLASS)); + switch (type) { case cc_type_normal: break; - case cc_type_super: - *(VALUE *)&cc->flags |= VM_CALLCACHE_SUPER; + case cc_type_orphan: + *(VALUE *)&cc->flags |= VM_CALLCACHE_ORPHAN; break; case cc_type_refinement: *(VALUE *)&cc->flags |= VM_CALLCACHE_REFINEMENT; @@ -350,9 +360,9 @@ vm_cc_new(VALUE klass, } static inline bool -vm_cc_super_p(const struct rb_callcache *cc) +vm_cc_orphan_p(const struct rb_callcache *cc) { - return (cc->flags & VM_CALLCACHE_SUPER) != 0; + return (cc->flags & VM_CALLCACHE_ORPHAN) != 0; } static inline bool diff --git a/vm_core.h b/vm_core.h index 2a9d5f906f101f..cc6454daace4a3 100644 --- a/vm_core.h +++ b/vm_core.h @@ -418,6 +418,7 @@ struct rb_iseq_constant_body { unsigned int ruby2_keywords: 1; unsigned int anon_rest: 1; unsigned int anon_kwrest: 1; + unsigned int forwardable: 1; } flags; unsigned int size; diff --git a/vm_insnhelper.c b/vm_insnhelper.c index fe038e86181aaa..3d34560156ad4a 100644 --- a/vm_insnhelper.c +++ b/vm_insnhelper.c @@ -2006,7 +2006,7 @@ vm_ccs_push(VALUE 
klass, struct rb_class_cc_entries *ccs, const struct rb_callin if (! vm_cc_markable(cc)) { return; } - else if (! vm_ci_markable(ci)) { + else if (! vm_ci_cacheable(ci)) { return; } @@ -2054,11 +2054,12 @@ vm_ccs_verify(struct rb_class_cc_entries *ccs, ID mid, VALUE klass) const struct rb_callcache *cc = ccs->entries[i].cc; VM_ASSERT(vm_ci_p(ci)); + VM_ASSERT(vm_ci_markable(ci)); VM_ASSERT(vm_ci_mid(ci) == mid); VM_ASSERT(IMEMO_TYPE_P(cc, imemo_callcache)); VM_ASSERT(vm_cc_class_check(cc, klass)); VM_ASSERT(vm_cc_check_cme(cc, ccs->cme)); - VM_ASSERT(!vm_cc_super_p(cc)); + VM_ASSERT(!vm_cc_orphan_p(cc)); VM_ASSERT(!vm_cc_refinement_p(cc)); } return TRUE; @@ -2153,7 +2154,7 @@ vm_search_cc(const VALUE klass, const struct rb_callinfo * const ci) cme = rb_check_overloaded_cme(cme, ci); - const struct rb_callcache *cc = vm_cc_new(klass, cme, vm_call_general, cc_type_normal); + const struct rb_callcache *cc = vm_cc_new(klass, cme, vm_call_general, vm_ci_cacheable(ci) ? cc_type_normal : cc_type_orphan); vm_ccs_push(klass, ccs, ci, cc); VM_ASSERT(vm_cc_cme(cc) != NULL); @@ -2521,6 +2522,15 @@ vm_base_ptr(const rb_control_frame_t *cfp) if (cfp->iseq && VM_FRAME_RUBYFRAME_P(cfp)) { VALUE *bp = prev_cfp->sp + ISEQ_BODY(cfp->iseq)->local_table_size + VM_ENV_DATA_SIZE; + + if (ISEQ_BODY(cfp->iseq)->param.flags.forwardable && VM_ENV_LOCAL_P(cfp->ep)) { + int lts = ISEQ_BODY(cfp->iseq)->local_table_size; + int params = ISEQ_BODY(cfp->iseq)->param.size; + + CALL_INFO ci = (CALL_INFO)cfp->ep[-(VM_ENV_DATA_SIZE + (lts - params))]; // skip EP stuff, CI should be last local + bp += vm_ci_argc(ci); + } + if (ISEQ_BODY(cfp->iseq)->type == ISEQ_TYPE_METHOD || VM_FRAME_BMETHOD_P(cfp)) { /* adjust `self' */ bp += 1; @@ -2589,6 +2599,7 @@ rb_simple_iseq_p(const rb_iseq_t *iseq) ISEQ_BODY(iseq)->param.flags.has_kw == FALSE && ISEQ_BODY(iseq)->param.flags.has_kwrest == FALSE && ISEQ_BODY(iseq)->param.flags.accepts_no_kwarg == FALSE && + ISEQ_BODY(iseq)->param.flags.forwardable == FALSE && ISEQ_BODY(iseq)->param.flags.has_block == FALSE; } @@ -2601,6 +2612,7 @@ rb_iseq_only_optparam_p(const rb_iseq_t *iseq) ISEQ_BODY(iseq)->param.flags.has_kw == FALSE && ISEQ_BODY(iseq)->param.flags.has_kwrest == FALSE && ISEQ_BODY(iseq)->param.flags.accepts_no_kwarg == FALSE && + ISEQ_BODY(iseq)->param.flags.forwardable == FALSE && ISEQ_BODY(iseq)->param.flags.has_block == FALSE; } @@ -2612,6 +2624,7 @@ rb_iseq_only_kwparam_p(const rb_iseq_t *iseq) ISEQ_BODY(iseq)->param.flags.has_post == FALSE && ISEQ_BODY(iseq)->param.flags.has_kw == TRUE && ISEQ_BODY(iseq)->param.flags.has_kwrest == FALSE && + ISEQ_BODY(iseq)->param.flags.forwardable == FALSE && ISEQ_BODY(iseq)->param.flags.has_block == FALSE; } @@ -2972,7 +2985,51 @@ vm_callee_setup_arg(rb_execution_context_t *ec, struct rb_calling_info *calling, { const struct rb_callinfo *ci = calling->cd->ci; const struct rb_callcache *cc = calling->cc; - bool cacheable_ci = vm_ci_markable(ci); + bool cacheable_ci = vm_ci_cacheable(ci); + + // Called iseq is using ... param + // def foo(...) # <- iseq for foo will have "forwardable" + // + // We want to set the `...` local to the caller's CI + // foo(1, 2) # <- the ci for this should end up as `...` + // + // So hopefully the stack looks like: + // + // => 1 + // => 2 + // => * + // => ** + // => & + // => ... 
# <- points at `foo`s CI + // => cref_or_me + // => specval + // => type + // + if (ISEQ_BODY(iseq)->param.flags.forwardable) { + if ((vm_ci_flag(ci) & VM_CALL_FORWARDING)) { + struct rb_forwarding_call_data * forward_cd = (struct rb_forwarding_call_data *)calling->cd; + if (vm_ci_argc(ci) != vm_ci_argc(forward_cd->caller_ci)) { + ci = vm_ci_new_runtime( + vm_ci_mid(ci), + vm_ci_flag(ci), + vm_ci_argc(ci), + vm_ci_kwarg(ci)); + } else { + ci = forward_cd->caller_ci; + } + } + // C functions calling iseqs will stack allocate a CI, + // so we need to convert it to heap allocated + if (!vm_ci_markable(ci)) { + ci = vm_ci_new_runtime( + vm_ci_mid(ci), + vm_ci_flag(ci), + vm_ci_argc(ci), + vm_ci_kwarg(ci)); + } + argv[param_size - 1] = (VALUE)ci; + return 0; + } if (LIKELY(!(vm_ci_flag(ci) & VM_CALL_KW_SPLAT))) { if (LIKELY(rb_simple_iseq_p(iseq))) { @@ -2984,7 +3041,7 @@ vm_callee_setup_arg(rb_execution_context_t *ec, struct rb_calling_info *calling, argument_arity_error(ec, iseq, calling->argc, lead_num, lead_num); } - VM_ASSERT(ci == calling->cd->ci); + //VM_ASSERT(ci == calling->cd->ci); VM_ASSERT(cc == calling->cc); if (cacheable_ci && vm_call_iseq_optimizable_p(ci, cc)) { @@ -3074,6 +3131,78 @@ vm_callee_setup_arg(rb_execution_context_t *ec, struct rb_calling_info *calling, return setup_parameters_complex(ec, iseq, calling, ci, argv, arg_setup_method); } +static void +vm_adjust_stack_forwarding(const struct rb_execution_context_struct *ec, struct rb_control_frame_struct *cfp, CALL_INFO callers_info, VALUE splat) +{ + // This case is when the caller is using a ... parameter. + // For example `bar(...)`. The call info will have VM_CALL_FORWARDING + // In this case the caller's caller's CI will be on the stack. + // + // For example: + // + // def bar(a, b); a + b; end + // def foo(...); bar(...); end + // foo(1, 2) # <- this CI will be on the stack when we call `bar(...)` + // + // Stack layout will be: + // + // > 1 + // > 2 + // > CI for foo(1, 2) + // > cref_or_me + // > specval + // > type + // > receiver + // > CI for foo(1, 2), via `getlocal ...` + // > ( SP points here ) + const VALUE * lep = VM_CF_LEP(cfp); + + // We'll need to copy argc args to this SP + int argc = vm_ci_argc(callers_info); + + const rb_iseq_t *iseq; + + // If we're in an escaped environment (lambda for example), get the iseq + // from the captured env. 
+ if (VM_ENV_FLAGS(lep, VM_ENV_FLAG_ESCAPED)) { + rb_env_t * env = (rb_env_t *)lep[VM_ENV_DATA_INDEX_ENV]; + iseq = env->iseq; + } + else { // Otherwise use the lep to find the caller + iseq = rb_vm_search_cf_from_ep(ec, cfp, lep)->iseq; + } + + // Our local storage is below the args we need to copy + int local_size = ISEQ_BODY(iseq)->local_table_size + argc; + + const VALUE * from = lep - (local_size + VM_ENV_DATA_SIZE - 1); // 2 for EP values + VALUE * to = cfp->sp - 1; // clobber the CI + + if (RTEST(splat)) { + to -= 1; // clobber the splat array + CHECK_VM_STACK_OVERFLOW0(cfp, to, RARRAY_LEN(splat)); + MEMCPY(to, RARRAY_CONST_PTR(splat), VALUE, RARRAY_LEN(splat)); + to += RARRAY_LEN(splat); + } + + CHECK_VM_STACK_OVERFLOW0(cfp, to, argc); + MEMCPY(to, from, VALUE, argc); + cfp->sp = to + argc; + + // Stack layout should now be: + // + // > 1 + // > 2 + // > CI for foo(1, 2) + // > cref_or_me + // > specval + // > type + // > receiver + // > 1 + // > 2 + // > ( SP points here ) +} + static VALUE vm_call_iseq_setup(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling) { @@ -3081,8 +3210,15 @@ vm_call_iseq_setup(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct r const struct rb_callcache *cc = calling->cc; const rb_iseq_t *iseq = def_iseq_ptr(vm_cc_cme(cc)->def); - const int param_size = ISEQ_BODY(iseq)->param.size; - const int local_size = ISEQ_BODY(iseq)->local_table_size; + int param_size = ISEQ_BODY(iseq)->param.size; + int local_size = ISEQ_BODY(iseq)->local_table_size; + + // Setting up local size and param size + if (ISEQ_BODY(iseq)->param.flags.forwardable) { + local_size = local_size + vm_ci_argc(calling->cd->ci); + param_size = param_size + vm_ci_argc(calling->cd->ci); + } + const int opt_pc = vm_callee_setup_arg(ec, calling, iseq, cfp->sp - calling->argc, param_size, local_size); return vm_call_iseq_setup_2(ec, cfp, calling, opt_pc, param_size, local_size); } @@ -3592,7 +3728,7 @@ vm_call_cfunc_other(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, str return vm_call_cfunc_with_frame_(ec, reg_cfp, calling, argc, argv, stack_bottom); } else { - CC_SET_FASTPATH(calling->cc, vm_call_cfunc_with_frame, !rb_splat_or_kwargs_p(ci) && !calling->kw_splat); + CC_SET_FASTPATH(calling->cc, vm_call_cfunc_with_frame, !rb_splat_or_kwargs_p(ci) && !calling->kw_splat && !(vm_ci_flag(ci) & VM_CALL_FORWARDING)); return vm_call_cfunc_with_frame(ec, reg_cfp, calling); } @@ -4474,7 +4610,7 @@ vm_call_method_each_type(rb_execution_context_t *ec, rb_control_frame_t *cfp, st rb_check_arity(calling->argc, 1, 1); - const unsigned int aset_mask = (VM_CALL_ARGS_SPLAT | VM_CALL_KW_SPLAT | VM_CALL_KWARG); + const unsigned int aset_mask = (VM_CALL_ARGS_SPLAT | VM_CALL_KW_SPLAT | VM_CALL_KWARG | VM_CALL_FORWARDING); if (vm_cc_markable(cc)) { vm_cc_attr_index_initialize(cc, INVALID_SHAPE_ID); @@ -4508,7 +4644,7 @@ vm_call_method_each_type(rb_execution_context_t *ec, rb_control_frame_t *cfp, st CALLER_SETUP_ARG(cfp, calling, ci, 0); rb_check_arity(calling->argc, 0, 0); vm_cc_attr_index_initialize(cc, INVALID_SHAPE_ID); - const unsigned int ivar_mask = (VM_CALL_ARGS_SPLAT | VM_CALL_KW_SPLAT); + const unsigned int ivar_mask = (VM_CALL_ARGS_SPLAT | VM_CALL_KW_SPLAT | VM_CALL_FORWARDING); VM_CALL_METHOD_ATTR(v, vm_call_ivar(ec, cfp, calling), CC_SET_FASTPATH(cc, vm_call_ivar, !(vm_ci_flag(ci) & ivar_mask))); @@ -4727,13 +4863,18 @@ vm_search_super_method(const rb_control_frame_t *reg_cfp, struct rb_call_data *c ID mid = me->def->original_id; - // update iseq. 
really? (TODO) - cd->ci = vm_ci_new_runtime(mid, - vm_ci_flag(cd->ci), - vm_ci_argc(cd->ci), - vm_ci_kwarg(cd->ci)); + if (!vm_ci_markable(cd->ci)) { + VM_FORCE_WRITE((const VALUE *)&cd->ci->mid, (VALUE)mid); + } + else { + // update iseq. really? (TODO) + cd->ci = vm_ci_new_runtime(mid, + vm_ci_flag(cd->ci), + vm_ci_argc(cd->ci), + vm_ci_kwarg(cd->ci)); - RB_OBJ_WRITTEN(reg_cfp->iseq, Qundef, cd->ci); + RB_OBJ_WRITTEN(reg_cfp->iseq, Qundef, cd->ci); + } const struct rb_callcache *cc; @@ -4741,7 +4882,7 @@ vm_search_super_method(const rb_control_frame_t *reg_cfp, struct rb_call_data *c if (!klass) { /* bound instance method of module */ - cc = vm_cc_new(klass, NULL, vm_call_method_missing, cc_type_super); + cc = vm_cc_new(klass, NULL, vm_call_method_missing, cc_type_orphan); RB_OBJ_WRITE(reg_cfp->iseq, &cd->cc, cc); } else { @@ -4756,7 +4897,7 @@ vm_search_super_method(const rb_control_frame_t *reg_cfp, struct rb_call_data *c else if (cached_cme->called_id != mid) { const rb_callable_method_entry_t *cme = rb_callable_method_entry(klass, mid); if (cme) { - cc = vm_cc_new(klass, cme, vm_call_super_method, cc_type_super); + cc = vm_cc_new(klass, cme, vm_call_super_method, cc_type_orphan); RB_OBJ_WRITE(reg_cfp->iseq, &cd->cc, cc); } else { @@ -5668,8 +5809,20 @@ VALUE rb_vm_send(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, CALL_DATA cd, ISEQ blockiseq) { stack_check(ec); - VALUE bh = vm_caller_setup_arg_block(ec, GET_CFP(), cd->ci, blockiseq, false); + + struct rb_forwarding_call_data adjusted_cd; + struct rb_callinfo adjusted_ci; + CALL_DATA _cd = cd; + + VALUE bh = vm_caller_setup_args(ec, GET_CFP(), &cd, blockiseq, false, &adjusted_cd, &adjusted_ci); VALUE val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_method); + + if (vm_ci_flag(_cd->ci) & VM_CALL_FORWARDING) { + if (_cd->cc != cd->cc && vm_cc_markable(cd->cc)) { + RB_OBJ_WRITE(GET_ISEQ(), &_cd->cc, cd->cc); + } + } + VM_EXEC(ec, val); return val; } @@ -5688,8 +5841,19 @@ VALUE rb_vm_invokesuper(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, CALL_DATA cd, ISEQ blockiseq) { stack_check(ec); - VALUE bh = vm_caller_setup_arg_block(ec, GET_CFP(), cd->ci, blockiseq, true); + struct rb_forwarding_call_data adjusted_cd; + struct rb_callinfo adjusted_ci; + CALL_DATA _cd = cd; + + VALUE bh = vm_caller_setup_args(ec, GET_CFP(), &cd, blockiseq, true, &adjusted_cd, &adjusted_ci); VALUE val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_super); + + if (vm_ci_flag(_cd->ci) & VM_CALL_FORWARDING) { + if (_cd->cc != cd->cc && vm_cc_markable(cd->cc)) { + RB_OBJ_WRITE(GET_ISEQ(), &_cd->cc, cd->cc); + } + } + VM_EXEC(ec, val); return val; } diff --git a/vm_insnhelper.h b/vm_insnhelper.h index 286bc1f6714b68..52d0a0cacc2cc4 100644 --- a/vm_insnhelper.h +++ b/vm_insnhelper.h @@ -58,6 +58,11 @@ RUBY_EXTERN rb_serial_t ruby_vm_global_cvar_state; VM_REG_CFP = ec->cfp; \ } while (0) +struct rb_forwarding_call_data { + struct rb_call_data cd; + CALL_INFO caller_ci; +}; + #if VM_COLLECT_USAGE_DETAILS enum vm_regan_regtype { VM_REGAN_PC = 0, diff --git a/vm_method.c b/vm_method.c index 5ca302a86a8c75..ac7d8c84ac2067 100644 --- a/vm_method.c +++ b/vm_method.c @@ -1184,6 +1184,7 @@ rb_check_overloaded_cme(const rb_callable_method_entry_t *cme, const struct rb_c { if (UNLIKELY(cme->def->iseq_overload) && (vm_ci_flag(ci) & (VM_CALL_ARGS_SIMPLE)) && + (!(vm_ci_flag(ci) & VM_CALL_FORWARDING)) && (int)vm_ci_argc(ci) == ISEQ_BODY(method_entry_iseqptr(cme))->param.lead_num) { VM_ASSERT(cme->def->type == VM_METHOD_TYPE_ISEQ); // iseq_overload is 
marked only on ISEQ methods diff --git a/yjit.c b/yjit.c index ffd180429e4485..5f1626f6465a0d 100644 --- a/yjit.c +++ b/yjit.c @@ -695,6 +695,12 @@ rb_get_iseq_flags_accepts_no_kwarg(const rb_iseq_t *iseq) return iseq->body->param.flags.accepts_no_kwarg; } +bool +rb_get_iseq_flags_forwardable(const rb_iseq_t *iseq) +{ + return iseq->body->param.flags.forwardable; +} + const rb_seq_param_keyword_struct * rb_get_iseq_body_param_keyword(const rb_iseq_t *iseq) { diff --git a/yjit/bindgen/src/main.rs b/yjit/bindgen/src/main.rs index c58df7c3773b90..cde4614ca919db 100644 --- a/yjit/bindgen/src/main.rs +++ b/yjit/bindgen/src/main.rs @@ -426,6 +426,7 @@ fn main() { .allowlist_function("rb_get_iseq_flags_ambiguous_param0") .allowlist_function("rb_get_iseq_flags_accepts_no_kwarg") .allowlist_function("rb_get_iseq_flags_ruby2_keywords") + .allowlist_function("rb_get_iseq_flags_forwardable") .allowlist_function("rb_get_iseq_body_local_table_size") .allowlist_function("rb_get_iseq_body_param_keyword") .allowlist_function("rb_get_iseq_body_param_size") diff --git a/yjit/src/codegen.rs b/yjit/src/codegen.rs index d60f4f0ec1029b..cd7f9b7938b2ee 100644 --- a/yjit/src/codegen.rs +++ b/yjit/src/codegen.rs @@ -6910,6 +6910,8 @@ fn gen_send_iseq( let kw_splat = flags & VM_CALL_KW_SPLAT != 0; let splat_call = flags & VM_CALL_ARGS_SPLAT != 0; + let forwarding_call = unsafe { rb_get_iseq_flags_forwardable(iseq) }; + // For computing offsets to callee locals let num_params = unsafe { get_iseq_body_param_size(iseq) as i32 }; let num_locals = unsafe { get_iseq_body_local_table_size(iseq) as i32 }; @@ -6942,6 +6944,12 @@ fn gen_send_iseq( // Stack index of the splat array let splat_pos = i32::from(block_arg) + i32::from(kw_splat) + kw_arg_num; + // Dynamic stack layout. No good way to support without inlining. 
+ if flags & VM_CALL_FORWARDING != 0 { + gen_counter_incr(asm, Counter::send_iseq_forwarding); + return None; + } + exit_if_stack_too_large(iseq)?; exit_if_tail_call(asm, ci)?; exit_if_has_post(asm, iseq)?; @@ -6952,7 +6960,9 @@ fn gen_send_iseq( exit_if_supplying_kw_and_has_no_kw(asm, supplying_kws, doing_kw_call)?; exit_if_supplying_kws_and_accept_no_kwargs(asm, supplying_kws, iseq)?; exit_if_doing_kw_and_splat(asm, doing_kw_call, flags)?; - exit_if_wrong_number_arguments(asm, arg_setup_block, opts_filled, flags, opt_num, iseq_has_rest)?; + if !forwarding_call { + exit_if_wrong_number_arguments(asm, arg_setup_block, opts_filled, flags, opt_num, iseq_has_rest)?; + } exit_if_doing_kw_and_opts_missing(asm, doing_kw_call, opts_missing)?; exit_if_has_rest_and_optional_and_block(asm, iseq_has_rest, opt_num, iseq, block_arg)?; let block_arg_type = exit_if_unsupported_block_arg_type(jit, asm, block_arg)?; @@ -7463,6 +7473,7 @@ fn gen_send_iseq( } } + if !forwarding_call { // Nil-initialize missing optional parameters nil_fill( "nil-initialize missing optionals", @@ -7490,6 +7501,13 @@ fn gen_send_iseq( }, asm ); + } + + if forwarding_call { + dbg!(get_iseq_name(iseq)); + assert_eq!(1, num_params); + asm.mov(asm.stack_opnd(-1), VALUE(ci as usize).into()); + } // Points to the receiver operand on the stack unless a captured environment is used let recv = match captured_opnd { @@ -7508,7 +7526,13 @@ fn gen_send_iseq( jit_save_pc(jit, asm); // Adjust the callee's stack pointer - let callee_sp = asm.lea(asm.ctx.sp_opnd(-argc + num_locals + VM_ENV_DATA_SIZE as i32)); + let callee_sp = if forwarding_call { + let offs = num_locals + VM_ENV_DATA_SIZE as i32; + asm.lea(asm.ctx.sp_opnd(offs)) + } else { + let offs = -argc + num_locals + VM_ENV_DATA_SIZE as i32; + asm.lea(asm.ctx.sp_opnd(offs)) + }; let specval = if let Some(prev_ep) = prev_ep { // We've already side-exited if the callee expects a block, so we @@ -9057,6 +9081,10 @@ fn gen_invokesuper_specialized( gen_counter_incr(asm, Counter::invokesuper_kw_splat); return None; } + if ci_flags & VM_CALL_FORWARDING != 0 { + gen_counter_incr(asm, Counter::invokesuper_forwarding); + return None; + } // Ensure we haven't rebound this method onto an incompatible class. 
// In the interpreter we try to avoid making this check by performing some diff --git a/yjit/src/cruby.rs b/yjit/src/cruby.rs index 9547e3fa2cc67b..362b0d66aea979 100644 --- a/yjit/src/cruby.rs +++ b/yjit/src/cruby.rs @@ -714,6 +714,7 @@ mod manual_defs { pub const VM_CALL_ARGS_SIMPLE: u32 = 1 << VM_CALL_ARGS_SIMPLE_bit; pub const VM_CALL_ARGS_SPLAT: u32 = 1 << VM_CALL_ARGS_SPLAT_bit; pub const VM_CALL_ARGS_BLOCKARG: u32 = 1 << VM_CALL_ARGS_BLOCKARG_bit; + pub const VM_CALL_FORWARDING: u32 = 1 << VM_CALL_FORWARDING_bit; pub const VM_CALL_FCALL: u32 = 1 << VM_CALL_FCALL_bit; pub const VM_CALL_KWARG: u32 = 1 << VM_CALL_KWARG_bit; pub const VM_CALL_KW_SPLAT: u32 = 1 << VM_CALL_KW_SPLAT_bit; diff --git a/yjit/src/cruby_bindings.inc.rs b/yjit/src/cruby_bindings.inc.rs index 359227d60db211..0e1b55cb46c4c4 100644 --- a/yjit/src/cruby_bindings.inc.rs +++ b/yjit/src/cruby_bindings.inc.rs @@ -664,7 +664,8 @@ pub const VM_CALL_ZSUPER_bit: vm_call_flag_bits = 9; pub const VM_CALL_OPT_SEND_bit: vm_call_flag_bits = 10; pub const VM_CALL_KW_SPLAT_MUT_bit: vm_call_flag_bits = 11; pub const VM_CALL_ARGS_SPLAT_MUT_bit: vm_call_flag_bits = 12; -pub const VM_CALL__END: vm_call_flag_bits = 13; +pub const VM_CALL_FORWARDING_bit: vm_call_flag_bits = 13; +pub const VM_CALL__END: vm_call_flag_bits = 14; pub type vm_call_flag_bits = u32; #[repr(C)] pub struct rb_callinfo_kwarg { @@ -1164,6 +1165,7 @@ extern "C" { pub fn rb_get_iseq_flags_has_block(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_ambiguous_param0(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_flags_accepts_no_kwarg(iseq: *const rb_iseq_t) -> bool; + pub fn rb_get_iseq_flags_forwardable(iseq: *const rb_iseq_t) -> bool; pub fn rb_get_iseq_body_param_keyword( iseq: *const rb_iseq_t, ) -> *const rb_seq_param_keyword_struct; diff --git a/yjit/src/stats.rs b/yjit/src/stats.rs index 0a63fab8b049ae..2a64d8bd3c5997 100644 --- a/yjit/src/stats.rs +++ b/yjit/src/stats.rs @@ -376,6 +376,7 @@ make_counters! { send_iseq_block_arg_type, send_iseq_clobbering_block_arg, send_iseq_complex_discard_extras, + send_iseq_forwarding, send_iseq_leaf_builtin_block_arg_block_param, send_iseq_kw_splat_non_nil, send_iseq_kwargs_mismatch, @@ -412,6 +413,7 @@ make_counters! { send_optimized_block_arg, invokesuper_defined_class_mismatch, + invokesuper_forwarding, invokesuper_kw_splat, invokesuper_kwarg, invokesuper_megamorphic,