diff --git a/plutus-core/changelog.d/20240523_124004_koz.ross_bitwise_2.md b/plutus-core/changelog.d/20240523_124004_koz.ross_bitwise_2.md new file mode 100644 index 00000000000..d4cded39098 --- /dev/null +++ b/plutus-core/changelog.d/20240523_124004_koz.ross_bitwise_2.md @@ -0,0 +1,39 @@ + + + +### Added + +- Implementation and tests for primitive operations in [this + CIP](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-XXXX/CIP-XXXX.md) + +### Changed + +- Rename `ReplicateByteString` to `ReplicateByte` (and similarly for denotation) + + + + diff --git a/plutus-core/plutus-core.cabal b/plutus-core/plutus-core.cabal index 91948b911c1..02313c2b80e 100644 --- a/plutus-core/plutus-core.cabal +++ b/plutus-core/plutus-core.cabal @@ -92,8 +92,7 @@ library PlutusCore.Analysis.Definitions PlutusCore.Annotation PlutusCore.Arity - PlutusCore.Bitwise.Convert - PlutusCore.Bitwise.Logical + PlutusCore.Bitwise PlutusCore.Builtin PlutusCore.Builtin.Debug PlutusCore.Builtin.Elaborate @@ -417,6 +416,7 @@ test-suite untyped-plutus-core-test DeBruijn.Spec DeBruijn.UnDeBruijnify Evaluation.Builtins + Evaluation.Builtins.Bitwise Evaluation.Builtins.BLS12_381 Evaluation.Builtins.BLS12_381.TestClasses Evaluation.Builtins.BLS12_381.Utils @@ -430,6 +430,7 @@ test-suite untyped-plutus-core-test Evaluation.Debug Evaluation.FreeVars Evaluation.Golden + Evaluation.Helpers Evaluation.Machines Evaluation.Regressions Flat.Spec diff --git a/plutus-core/plutus-core/src/PlutusCore/Bitwise.hs b/plutus-core/plutus-core/src/PlutusCore/Bitwise.hs new file mode 100644 index 00000000000..7ffae07dbf6 --- /dev/null +++ b/plutus-core/plutus-core/src/PlutusCore/Bitwise.hs @@ -0,0 +1,1168 @@ +-- editorconfig-checker-disable-file + +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE OverloadedStrings #-} + +-- | Implementations for CIP-121, CIP-122 and CIP-123. Grouped because they all operate on +-- 'ByteString's, and require similar functionality. +module PlutusCore.Bitwise ( + -- * Wrappers + integerToByteStringWrapper, + byteStringToIntegerWrapper, + -- * Implementation details + IntegerToByteStringError (..), + integerToByteStringMaximumOutputLength, + integerToByteString, + byteStringToInteger, + andByteString, + orByteString, + xorByteString, + complementByteString, + readBit, + writeBits, + replicateByte, + shiftByteString, + rotateByteString, + countSetBits, + findFirstSetBit + ) where + +import PlutusCore.Builtin (BuiltinResult, emit) +import PlutusCore.Evaluation.Result (evaluationFailure) + +import ByteString.StrictBuilder (Builder) +import ByteString.StrictBuilder qualified as Builder +import Control.Exception (Exception, throw, try) +import Control.Monad (guard, unless, when) +import Data.Bits (unsafeShiftL, unsafeShiftR, (.|.)) +import Data.Bits qualified as Bits +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Data.ByteString.Internal qualified as BSI +import Data.Foldable (for_, traverse_) +import Data.Text (pack) +import Data.Word (Word64, Word8) +import Foreign.Marshal.Utils (copyBytes, fillBytes) +import Foreign.Ptr (Ptr, castPtr, plusPtr) +import Foreign.Storable (peekByteOff, peekElemOff, pokeByteOff, pokeElemOff) +import GHC.ByteOrder (ByteOrder (BigEndian, LittleEndian)) +import GHC.Exts (Int (I#)) +import GHC.Integer.Logarithms (integerLog2#) +import GHC.IO.Unsafe (unsafeDupablePerformIO) + +{- Note [Input length limitation for IntegerToByteString]. 
We make
+ `integerToByteString` fail if it is called with arguments which would cause
+ the length of the result to exceed about 8K bytes because the execution time
+ becomes difficult to predict accurately beyond this point (benchmarks on a
+ number of different machines show that the CPU time increases smoothly for
+ inputs up to about 8K then increases sharply, becoming chaotic after about
+ 14K). This restriction may be removed once a more efficient implementation
+ becomes available, which may happen when we no longer have to support GHC
+ 8.10. -}
+{- NB: if we do relax the length restriction then we will need two variants of
+ integerToByteString in Plutus Core so that we can continue to support the
+ current behaviour for old scripts.-}
+integerToByteStringMaximumOutputLength :: Integer
+integerToByteStringMaximumOutputLength = 8192
+
+{- Return the base 2 logarithm of an integer, returning 0 for inputs that aren't
+ strictly positive. This is essentially copied from GHC.Num.Integer, which
+ has integerLog2 but only in GHC >= 9.0. We should use the library function
+ instead when we stop supporting 8.10. -}
+integerLog2 :: Integer -> Int
+integerLog2 !i = I# (integerLog2# i)
+
+-- | Wrapper for 'integerToByteString' to make it more convenient to define as a builtin.
+integerToByteStringWrapper :: Bool -> Integer -> Integer -> BuiltinResult ByteString
+integerToByteStringWrapper endiannessArg lengthArg input
+ -- Check that the length is non-negative.
+ | lengthArg < 0 = do
+ emit "integerToByteString: negative length argument"
+ emit $ "Length requested: " <> (pack . show $ lengthArg)
+ evaluationFailure
+ -- Check that the requested length does not exceed the limit. *NB*: if we remove the limit we'll
+ -- still have to make sure that the length fits into an Int.
+ | lengthArg > integerToByteStringMaximumOutputLength = do
+ emit . pack $ "integerToByteString: requested length is too long (maximum is "
+ ++ show integerToByteStringMaximumOutputLength
+ ++ " bytes)"
+ emit $ "Length requested: " <> (pack . show $ lengthArg)
+ evaluationFailure
+ -- If the requested length is zero (ie, an explicit output size is not
+ -- specified) we still have to make sure that the output won't exceed the size
+ -- limit. If the requested length is nonzero and less than the limit,
+ -- integerToByteString checks that the input fits.
+ | lengthArg == 0 -- integerLog2 n is one less than the number of significant bits in n
+ && fromIntegral (integerLog2 input) >= 8 * integerToByteStringMaximumOutputLength =
+ let bytesRequiredFor n = integerLog2 n `div` 8 + 1
+ -- ^ This gives 1 instead of 0 for n=0, but we'll never get that.
+ in do
+ emit . pack $ "integerToByteString: input too long (maximum is 2^"
+ ++ show (8 * integerToByteStringMaximumOutputLength)
+ ++ "-1)"
+ emit $ "Length required: " <> (pack . show $ bytesRequiredFor input)
+ evaluationFailure
+ | otherwise = let endianness = endiannessArgToByteOrder endiannessArg in
+ -- We use fromIntegral here, despite advice to the contrary in general when defining builtin
+ -- denotations. This is because, if we've made it this far, we know that overflow or truncation
+ -- are impossible: we've checked that whatever we got given fits inside a (non-negative) Int.
+ case integerToByteString endianness (fromIntegral lengthArg) input of
+ Left err -> case err of
+ NegativeInput -> do
+ emit "integerToByteString: cannot convert negative Integer"
+ -- This does work proportional to the size of input.
However, we're in a failing case
+ -- anyway, and the user's paid for work proportional to this size in any case.
+ emit $ "Input: " <> (pack . show $ input)
+ evaluationFailure
+ NotEnoughDigits -> do
+ emit "integerToByteString: cannot represent Integer in given number of bytes"
+ -- This does work proportional to the size of input. However, we're in a failing case
+ -- anyway, and the user's paid for work proportional to this size in any case.
+ emit $ "Input: " <> (pack . show $ input)
+ emit $ "Bytes requested: " <> (pack . show $ lengthArg)
+ evaluationFailure
+ Right result -> pure result
+
+-- | Wrapper for 'byteStringToInteger' to make it more convenient to define as a builtin.
+byteStringToIntegerWrapper ::
+ Bool -> ByteString -> Integer
+byteStringToIntegerWrapper statedEndiannessArg input =
+ let endianness = endiannessArgToByteOrder statedEndiannessArg in
+ byteStringToInteger endianness input
+
+-- | Structured type to help indicate conversion errors.
+data IntegerToByteStringError =
+ NegativeInput |
+ NotEnoughDigits
+ deriving stock (Eq, Show)
+
+-- | Conversion from 'Integer' to 'ByteString', as per
+-- [CIP-121](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0121).
+--
+-- For performance and clarity, the endianness argument uses
+-- 'ByteOrder', and the length argument is an 'Int'.
+integerToByteString :: ByteOrder -> Int -> Integer -> Either IntegerToByteStringError ByteString
+integerToByteString requestedByteOrder requestedLength input
+ | input < 0 = Left NegativeInput
+ | input == 0 = Right . BS.replicate requestedLength $ 0x00
+ -- We use manual specialization to ensure as few branches in loop bodies as
+ -- we can. See Note [Manual specialization] for details.
+ | requestedLength == 0 = Right . Builder.builderBytes $ case requestedByteOrder of
+ LittleEndian -> goLENoLimit mempty input
+ BigEndian -> goBENoLimit mempty input
+ | otherwise = do
+ let result = case requestedByteOrder of
+ LittleEndian -> goLELimit mempty input
+ BigEndian -> goBELimit mempty input
+ case result of
+ Nothing -> Left NotEnoughDigits
+ Just b -> Right . Builder.builderBytes $ b
+ where
+ goLELimit :: Builder -> Integer -> Maybe Builder
+ goLELimit acc remaining
+ | remaining == 0 = pure $ padLE acc
+ | otherwise = do
+ -- builderLength is constant time, so we don't track the length ourselves
+ guard (Builder.builderLength acc < requestedLength)
+ -- This allows extracting eight digits at once. See Note [Loop sectioning] for details on
+ -- why we do this. We also duplicate this code in several places: see Note [Manual
+ -- specialization] for why.
+ --
+ -- The code is basically equivalent to remaining `quotRem` 2^64, but more efficient. This
+ -- is for two reasons: firstly, GHC does not optimize divisions into shifts for Integer
+ -- (even if the divisor is constant), and secondly, the pair generated by `quotRem` costs
+ -- us as much as 15% performance, and GHC seems unable to eliminate it. Thus, we have to do
+ -- it like this instead.
+ let newRemaining = remaining `unsafeShiftR` 64
+ -- Given that remaining must be non-negative, fromInteger here effectively truncates to a
+ -- Word64, by retaining only the least-significant 8 bytes.
+ let digitGroup :: Word64 = fromInteger remaining + case newRemaining of + 0 -> finishLELimit acc digitGroup + _ -> goLELimit (acc <> Builder.storable digitGroup) newRemaining + finishLELimit :: Builder -> Word64 -> Maybe Builder + finishLELimit acc remaining + | remaining == 0 = pure $ padLE acc + | otherwise = do + guard (Builder.builderLength acc < requestedLength) + -- This is equivalent to 'remaining `quotRem` 256' followed by a conversion of the + -- remainder, but faster. This is similar to the larger example above, and we do it for + -- the same reasons. + let newRemaining = remaining `unsafeShiftR` 8 + let digit :: Word8 = fromIntegral remaining + finishLELimit (acc <> Builder.word8 digit) newRemaining + -- By separating the case where we don't need to concern ourselves with a + -- user-specified limit, we can avoid branching needlessly, or doing a + -- complex expression check on every loop. See Note [Manual specialization] + -- for why this matters. + goLENoLimit :: Builder -> Integer -> Builder + goLENoLimit acc remaining + | remaining == 0 = acc + | otherwise = let newRemaining = remaining `unsafeShiftR` 64 + digitGroup :: Word64 = fromInteger remaining + in case newRemaining of + 0 -> finishLENoLimit acc digitGroup + _ -> goLENoLimit (acc <> Builder.storable digitGroup) newRemaining + finishLENoLimit :: Builder -> Word64 -> Builder + finishLENoLimit acc remaining + | remaining == 0 = acc + | otherwise = + let newRemaining = remaining `unsafeShiftR` 8 + digit :: Word8 = fromIntegral remaining + in finishLENoLimit (acc <> Builder.word8 digit) newRemaining + padLE :: Builder -> Builder + padLE acc = let paddingLength = requestedLength - Builder.builderLength acc + in acc <> Builder.bytes (BS.replicate paddingLength 0x0) + -- We manually specialize the big-endian case: see Note [Manual specialization] for why. + goBELimit :: Builder -> Integer -> Maybe Builder + goBELimit acc remaining + | remaining == 0 = pure $ padBE acc + | otherwise = do + guard (Builder.builderLength acc < requestedLength) + let newRemaining = remaining `unsafeShiftR` 64 + let digitGroup :: Word64 = fromInteger remaining + case newRemaining of + 0 -> finishBELimit acc digitGroup + _ -> goBELimit (Builder.word64BE digitGroup <> acc) newRemaining + finishBELimit :: Builder -> Word64 -> Maybe Builder + finishBELimit acc remaining + | remaining == 0 = pure $ padBE acc + | otherwise = do + guard (Builder.builderLength acc < requestedLength) + let newRemaining = remaining `unsafeShiftR` 8 + let digit = fromIntegral remaining + finishBELimit (Builder.word8 digit <> acc) newRemaining + goBENoLimit :: Builder -> Integer -> Builder + goBENoLimit acc remaining + | remaining == 0 = acc + | otherwise = let newRemaining = remaining `unsafeShiftR` 64 + digitGroup = fromInteger remaining + in case newRemaining of + 0 -> finishBENoLimit acc digitGroup + _ -> goBENoLimit (Builder.word64BE digitGroup <> acc) newRemaining + finishBENoLimit :: Builder -> Word64 -> Builder + finishBENoLimit acc remaining + | remaining == 0 = acc + | otherwise = let newRemaining = remaining `unsafeShiftR` 8 + digit = fromIntegral remaining + in finishBENoLimit (Builder.word8 digit <> acc) newRemaining + padBE :: Builder -> Builder + padBE acc = let paddingLength = requestedLength - Builder.builderLength acc in + Builder.bytes (BS.replicate paddingLength 0x0) <> acc + +-- | Conversion from 'ByteString' to 'Integer', as per +-- [CIP-121](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0121). 
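+--
+-- For example, @byteStringToInteger BigEndian (BS.pack [0x01, 0x94])@ and
+-- @byteStringToInteger LittleEndian (BS.pack [0x94, 0x01])@ both evaluate to @404@;
+-- padding zeroes on the non-significant end are ignored, so
+-- @byteStringToInteger BigEndian (BS.pack [0x00, 0x01, 0x94])@ is @404@ as well.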
+-- +-- For clarity, the stated endianness argument uses 'ByteOrder'. +byteStringToInteger :: ByteOrder -> ByteString -> Integer + -- We use manual specialization to ensure as few branches in loop bodies as we can. See Note + -- [Manual specialization] for details. +byteStringToInteger statedByteOrder input = case statedByteOrder of + -- Since padding bytes in the most-significant-last representation go at + -- the end of the input, we can skip decoding them, as they won't affect + -- the result in any way. + LittleEndian -> case BS.findIndexEnd (/= 0x00) input of + -- If there are no nonzero bytes, it must be zero. + Nothing -> 0 + Just end -> goLE 0 end 0 + -- Since padding bytes in the most-significant-first representation go at + -- the beginning of the input, we can skip decoding them, as they won't + -- affect the result in any way. + BigEndian -> case BS.findIndex (/= 0x00) input of + Nothing -> 0 + Just end -> goBE 0 end 0 (BS.length input - 1) + where + -- Like with toByteString, we use loop sectioning to decode eight digits at once. See Note [Loop + -- sectioning] for why we do this. + goLE :: Integer -> Int -> Int -> Integer + goLE acc limit ix + | ix <= (limit - 7) = + let digitGroup = read64LE ix + -- Same as ix * 8, but faster. GHC might already do this optimization, but we may as + -- well be sure. + shift = ix `unsafeShiftL` 3 + newIx = ix + 8 + -- We use unsafeShiftL to move a group of eight digits into the right position in + -- the result, then combine with the accumulator. This is equivalent to a + -- multiplication by 2^64*k, but significantly faster, as GHC doesn't optimize + -- such multiplications into shifts for Integers. + newAcc = acc + fromIntegral digitGroup `unsafeShiftL` shift + in goLE newAcc limit newIx + | otherwise = finishLE acc limit ix + finishLE :: Integer -> Int -> Int -> Integer + finishLE acc limit ix + | ix > limit = acc + | otherwise = + let digit = BS.index input ix + shift = ix `unsafeShiftL` 3 + newIx = ix + 1 + -- Similarly to before, we use unsafeShiftL to move a single digit into the right + -- position in the result. + newAcc = acc + fromIntegral digit `unsafeShiftL` shift + in finishLE newAcc limit newIx + -- Technically, ByteString does not allow reading of anything bigger than a single byte. + -- However, because ByteStrings are counted arrays, caching already brings in adjacent bytes, + -- which makes fetching them quite cheap. Additionally, GHC appears to optimize this into a + -- block read of 64 bits at once, which saves memory movement. See Note [Superscalarity and + -- caching] for details of why this matters. + read64LE :: Int -> Word64 + read64LE startIx = + fromIntegral (BS.index input startIx) + .|. (fromIntegral (BS.index input (startIx + 1)) `unsafeShiftL` 8) + .|. (fromIntegral (BS.index input (startIx + 2)) `unsafeShiftL` 16) + .|. (fromIntegral (BS.index input (startIx + 3)) `unsafeShiftL` 24) + .|. (fromIntegral (BS.index input (startIx + 4)) `unsafeShiftL` 32) + .|. (fromIntegral (BS.index input (startIx + 5)) `unsafeShiftL` 40) + .|. (fromIntegral (BS.index input (startIx + 6)) `unsafeShiftL` 48) + .|. (fromIntegral (BS.index input (startIx + 7)) `unsafeShiftL` 56) + -- We manually specialize the big-endian cases: see Note [Manual specialization] for why. + -- + -- In the big-endian case, shifts and indexes change in different ways: indexes start _high_ + -- and _reduce_, but shifts start _low_ and rise. This is different to the little-endian case, + -- where both start low and rise. 
Thus, we track the index and shift separately in the
+ -- big-endian case: it makes the adjustments easier, and doesn't really change anything, as if
+ -- we wanted to compute the shift, we'd have to pass an offset argument anyway.
+ goBE :: Integer -> Int -> Int -> Int -> Integer
+ goBE acc limit shift ix
+ | ix >= (limit + 7) =
+ let digitGroup = read64BE ix
+ newShift = shift + 64
+ newIx = ix - 8
+ newAcc = acc + fromIntegral digitGroup `unsafeShiftL` shift
+ in goBE newAcc limit newShift newIx
+ | otherwise = finishBE acc limit shift ix
+ finishBE :: Integer -> Int -> Int -> Int -> Integer
+ finishBE acc limit shift ix
+ | ix < limit = acc
+ | otherwise =
+ let digit = BS.index input ix
+ newShift = shift + 8
+ newIx = ix - 1
+ newAcc = acc + fromIntegral digit `unsafeShiftL` shift
+ in finishBE newAcc limit newShift newIx
+ read64BE :: Int -> Word64
+ read64BE endIx =
+ fromIntegral (BS.index input endIx)
+ .|. (fromIntegral (BS.index input (endIx - 1)) `unsafeShiftL` 8)
+ .|. (fromIntegral (BS.index input (endIx - 2)) `unsafeShiftL` 16)
+ .|. (fromIntegral (BS.index input (endIx - 3)) `unsafeShiftL` 24)
+ .|. (fromIntegral (BS.index input (endIx - 4)) `unsafeShiftL` 32)
+ .|. (fromIntegral (BS.index input (endIx - 5)) `unsafeShiftL` 40)
+ .|. (fromIntegral (BS.index input (endIx - 6)) `unsafeShiftL` 48)
+ .|. (fromIntegral (BS.index input (endIx - 7)) `unsafeShiftL` 56)
+
+endiannessArgToByteOrder :: Bool -> ByteOrder
+endiannessArgToByteOrder b = if b then BigEndian else LittleEndian
+
+{- Note [Binary bitwise operation implementation and manual specialization]
+
+ All of the 'binary' bitwise operations (namely `andByteString`,
+ `orByteString` and `xorByteString`) operate similarly:
+
+ 1. Decide which of their two `ByteString` arguments determines the length
+ of the result. For padding semantics, this is the _longer_ argument,
+ whereas for truncation semantics, it's the _shorter_ one. If both
+ `ByteString` arguments have identical length, it doesn't matter which we
+ choose.
+ 2. Copy the choice made in step 1 into a fresh mutable buffer.
+ 3. Traverse over each byte of the argument _not_ chosen in step 1, and
+ combine each of those bytes with the byte at the corresponding index of
+ the fresh mutable buffer from step 2 (`.&.` for `andByteString`,
+ `.|.` for `orByteString`, `xor` for `xorByteString`).
+
+ We also make use of loop sectioning to optimize this operation: see Note
+ [Loop sectioning] explaining why we do this. Fundamentally, this doesn't
+ change the logic of the operation, but means that step 3 is split into
+ two smaller sub-steps: we first work 8 bytes at a time, then one byte at a
+ time to finish up if necessary. Other than the choice of 'combining
+ operation', the structure of the computation is the same, which suggests that
+ we want a helper function with a signature like
+
+ helper1 ::
+ (Word64 -> Word64 -> Word64) ->
+ (Word8 -> Word8 -> Word8) ->
+ ByteString ->
+ ByteString ->
+ Int ->
+ ByteString
+
+ or possibly (to avoid duplicate argument passing) like
+
+ helper2 ::
+ (forall (a :: Type) . Bits a => a -> a -> a) ->
+ ByteString ->
+ ByteString ->
+ Int ->
+ ByteString
+
+ This would allow us to share all this logic, and have each of the 'top-level'
+ operations just dispatch to either of the helpers with the appropriate
+ function argument(s). Instead, we chose to write a manual copy of this logic
+ for each of the 'top-level' operations, substituting only the 'combining
+ operation'.
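+
+ For illustration, with `helper2` in hand, each top-level operation would
+ reduce to a dispatch along these lines (a sketch of the rejected design,
+ not code that exists in this module):
+
+ andByteString shouldPad bs1 bs2 =
+ let (shorter, longer) = if BS.length bs1 < BS.length bs2 then (bs1, bs2) else (bs2, bs1)
+ (toCopy, toTraverse) = if shouldPad then (longer, shorter) else (shorter, longer)
+ in helper2 (Bits..&.) toCopy toTraverse (BS.length shorter)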
+
+ We made this choice as any design based on either `helper1` or `helper2` is
+ significantly slower (at least 50% worse, and the penalty _percentage_ grows
+ with argument size). While `helper2` is significantly more penalizing than
+ `helper1`, even `helper1` reaches an almost threefold slowdown at the higher
+ input sizes we are interested in relative to the manual version we use here.
+ Due to the 'low-level' nature of Plutus Core primops, we consider these costs
+ unacceptable relative to the (small) benefits to code clarity and maintainability
+ any solution using either style of helper would provide.
+
+ The reason for `helper2` under-performing is unsurprising: any argument whose
+ type is rank-2 polymorphic with a dictionary constraint essentially acts as
+ a 'program template', which gets interpreted at runtime given some dictionary
+ for a `Bits` instance. GHC can do practically nothing to optimize this, as
+ there is no way to tell, for any given argument, _which_ definitions of an
+ instance would be required here, even if the set of operations we use is
+ finite, since any instance can make use of the full power of Haskell, which
+ essentially lands us in Rice's Theorem territory. For `helper1`, the reasons
+ are similar: it _must_ be able to work regardless of what functions (assuming
+ appropriate types) it is given, which means in general, GHC is forced to
+ compile mother-may-I-style code involving pointer chasing through those arguments at
+ runtime. This explains why the 'blowup' becomes worse with argument length.
+
+ While in theory inlining could help with at least the `helper1` case (
+ `helper2` is beyond that technique), it doesn't seem like GHC is able to
+ figure this out, even with `INLINE` placed on `helper1`.
+ -}
+
+-- | Bitwise logical AND, as per [CIP-122](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122).
+{-# INLINEABLE andByteString #-}
+andByteString :: Bool -> ByteString -> ByteString -> ByteString
+andByteString shouldPad bs1 bs2 =
+ let (shorter, longer) = if BS.length bs1 < BS.length bs2 then (bs1, bs2) else (bs2, bs1)
+ (toCopy, toTraverse) = if shouldPad then (longer, shorter) else (shorter, longer)
+ in go toCopy toTraverse (BS.length shorter)
+ where
+ go :: ByteString -> ByteString -> Int -> ByteString
+ go toCopy toTraverse traverseLen =
+ unsafeDupablePerformIO . BS.useAsCStringLen toCopy $ \(copyPtr, copyLen) ->
+ BS.useAsCString toTraverse $ \traversePtr -> do
+ BSI.create copyLen $ \dstPtr -> do
+ copyBytes dstPtr (castPtr copyPtr) copyLen
+ let (bigStrides, littleStrides) = traverseLen `quotRem` 8
+ let offset = bigStrides * 8
+ let bigDstPtr :: Ptr Word64 = castPtr dstPtr
+ let bigTraversePtr :: Ptr Word64 = castPtr traversePtr
+ for_ [0 .. bigStrides - 1] $ \i -> do
+ w64_1 <- peekElemOff bigDstPtr i
+ w64_2 <- peekElemOff bigTraversePtr i
+ pokeElemOff bigDstPtr i $ w64_1 Bits..&. w64_2
+ let smallDstPtr :: Ptr Word8 = plusPtr dstPtr offset
+ let smallTraversePtr :: Ptr Word8 = plusPtr traversePtr offset
+ for_ [0 .. littleStrides - 1] $ \i -> do
+ w8_1 <- peekElemOff smallDstPtr i
+ w8_2 <- peekElemOff smallTraversePtr i
+ pokeElemOff smallDstPtr i $ w8_1 Bits..&. w8_2
+
+-- | Bitwise logical OR, as per [CIP-122](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122).
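+--
+-- For example, under truncation semantics, @orByteString False (BS.pack [0x0F, 0x00]) (BS.pack [0xF0])@
+-- evaluates to @BS.pack [0xFF]@, while under padding semantics,
+-- @orByteString True (BS.pack [0x0F, 0x00]) (BS.pack [0xF0])@ evaluates to @BS.pack [0xFF, 0x00]@.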
+{-# INLINEABLE orByteString #-} +orByteString :: Bool -> ByteString -> ByteString -> ByteString +orByteString shouldPad bs1 bs2 = + let (shorter, longer) = if BS.length bs1 < BS.length bs2 then (bs1, bs2) else (bs2, bs1) + (toCopy, toTraverse) = if shouldPad then (longer, shorter) else (shorter, longer) + in go toCopy toTraverse (BS.length shorter) + where + go :: ByteString -> ByteString -> Int -> ByteString + go toCopy toTraverse traverseLen = + unsafeDupablePerformIO . BS.useAsCStringLen toCopy $ \(copyPtr, copyLen) -> + BS.useAsCString toTraverse $ \traversePtr -> do + BSI.create copyLen $ \dstPtr -> do + copyBytes dstPtr (castPtr copyPtr) copyLen + let (bigStrides, littleStrides) = traverseLen `quotRem` 8 + let offset = bigStrides * 8 + let bigDstPtr :: Ptr Word64 = castPtr dstPtr + let bigTraversePtr :: Ptr Word64 = castPtr traversePtr + for_ [0 .. bigStrides - 1] $ \i -> do + w64_1 <- peekElemOff bigDstPtr i + w64_2 <- peekElemOff bigTraversePtr i + pokeElemOff bigDstPtr i $ w64_1 Bits..|. w64_2 + let smallDstPtr :: Ptr Word8 = plusPtr dstPtr offset + let smallTraversePtr :: Ptr Word8 = plusPtr traversePtr offset + for_ [0 .. littleStrides - 1] $ \i -> do + w8_1 <- peekElemOff smallDstPtr i + w8_2 <- peekElemOff smallTraversePtr i + pokeElemOff smallDstPtr i $ w8_1 Bits..|. w8_2 + +-- | Bitwise logical XOR, as per [CIP-122](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122). +{-# INLINEABLE xorByteString #-} +xorByteString :: Bool -> ByteString -> ByteString -> ByteString +xorByteString shouldPad bs1 bs2 = + let (shorter, longer) = if BS.length bs1 < BS.length bs2 then (bs1, bs2) else (bs2, bs1) + (toCopy, toTraverse) = if shouldPad then (longer, shorter) else (shorter, longer) + in go toCopy toTraverse (BS.length shorter) + where + go :: ByteString -> ByteString -> Int -> ByteString + go toCopy toTraverse traverseLen = + unsafeDupablePerformIO . BS.useAsCStringLen toCopy $ \(copyPtr, copyLen) -> + BS.useAsCString toTraverse $ \traversePtr -> do + BSI.create copyLen $ \dstPtr -> do + copyBytes dstPtr (castPtr copyPtr) copyLen + let (bigStrides, littleStrides) = traverseLen `quotRem` 8 + let offset = bigStrides * 8 + let bigDstPtr :: Ptr Word64 = castPtr dstPtr + let bigTraversePtr :: Ptr Word64 = castPtr traversePtr + for_ [0 .. bigStrides - 1] $ \i -> do + w64_1 <- peekElemOff bigDstPtr i + w64_2 <- peekElemOff bigTraversePtr i + pokeElemOff bigDstPtr i $ Bits.xor w64_1 w64_2 + let smallDstPtr :: Ptr Word8 = plusPtr dstPtr offset + let smallTraversePtr :: Ptr Word8 = plusPtr traversePtr offset + for_ [0 .. littleStrides - 1] $ \i -> do + w8_1 <- peekElemOff smallDstPtr i + w8_2 <- peekElemOff smallTraversePtr i + pokeElemOff smallDstPtr i $ Bits.xor w8_1 w8_2 + +-- | Bitwise logical complement, as per [CIP-122](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122). +{-# INLINEABLE complementByteString #-} +complementByteString :: ByteString -> ByteString +complementByteString bs = unsafeDupablePerformIO . BS.useAsCStringLen bs $ \(srcPtr, len) -> do + -- We use loop sectioning here; see Note [Loop sectioning] as to why we do this + let (bigStrides, littleStrides) = len `quotRem` 8 + let offset = bigStrides * 8 + BSI.create len $ \dstPtr -> do + let bigSrcPtr :: Ptr Word64 = castPtr srcPtr + let bigDstPtr :: Ptr Word64 = castPtr dstPtr + for_ [0 .. bigStrides - 1] $ \i -> do + w64 <- peekElemOff bigSrcPtr i + pokeElemOff bigDstPtr i . 
Bits.complement $ w64
+ let smallSrcPtr :: Ptr Word8 = plusPtr srcPtr offset
+ let smallDstPtr :: Ptr Word8 = plusPtr dstPtr offset
+ for_ [0 .. littleStrides - 1] $ \i -> do
+ w8 <- peekElemOff smallSrcPtr i
+ pokeElemOff smallDstPtr i . Bits.complement $ w8
+
+-- | Bit read at index, as per [CIP-122](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122)
+{-# INLINEABLE readBit #-}
+readBit :: ByteString -> Int -> BuiltinResult Bool
+readBit bs ix
+ | ix < 0 = do
+ emit "readBit: index out of bounds"
+ emit $ "Index: " <> (pack . show $ ix)
+ evaluationFailure
+ | ix >= len * 8 = do
+ emit "readBit: index out of bounds"
+ emit $ "Index: " <> (pack . show $ ix)
+ evaluationFailure
+ | otherwise = do
+ let (bigIx, littleIx) = ix `quotRem` 8
+ let flipIx = len - bigIx - 1
+ pure $ Bits.testBit (BS.index bs flipIx) littleIx
+ where
+ len :: Int
+ len = BS.length bs
+
+-- | Bulk bit write, as per [CIP-122](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122)
+{-# INLINEABLE writeBits #-}
+writeBits :: ByteString -> [(Integer, Bool)] -> BuiltinResult ByteString
+writeBits bs changelist = case unsafeDupablePerformIO . try $ go of
+ Left (WriteBitsException i) -> do
+ emit "writeBits: index out of bounds"
+ emit $ "Index: " <> (pack . show $ i)
+ evaluationFailure
+ Right result -> pure result
+ where
+ -- This is written in a somewhat strange way. See Note [writeBits and
+ -- exceptions], which covers why we did this.
+ go :: IO ByteString
+ go = BS.useAsCString bs $ \srcPtr ->
+ BSI.create len $ \dstPtr -> do
+ copyBytes dstPtr (castPtr srcPtr) len
+ traverse_ (setAtIx dstPtr) changelist
+ len :: Int
+ len = BS.length bs
+ bitLen :: Integer
+ bitLen = fromIntegral len * 8
+ setAtIx :: Ptr Word8 -> (Integer, Bool) -> IO ()
+ setAtIx ptr (i, b)
+ | i < 0 = throw $ WriteBitsException i
+ | i >= bitLen = throw $ WriteBitsException i
+ | otherwise = do
+ let (bigIx, littleIx) = i `quotRem` 8
+ let flipIx = len - fromIntegral bigIx - 1
+ w8 :: Word8 <- peekByteOff ptr flipIx
+ let toWrite = if b
+ then Bits.setBit w8 . fromIntegral $ littleIx
+ else Bits.clearBit w8 . fromIntegral $ littleIx
+ pokeByteOff ptr flipIx toWrite
+
+-- | Byte replication, as per [CIP-122](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122)
+replicateByte :: Int -> Word8 -> BuiltinResult ByteString
+replicateByte len w8
+ | len < 0 = do
+ emit "replicateByte: negative length requested"
+ evaluationFailure
+ | otherwise = pure . BS.replicate len $ w8
+
+{- Note [Shift and rotation implementation]
+
+Both shifts and rotations work similarly: they effectively impose a 'write
+offset' to bits in the data argument, then write those bits to the result
+with this offset applied. The difference between them is in what should be
+done if the resulting offset index would fall out of bounds: shifts just
+discard the data (and fill whatever remains with zeroes), while rotations
+'wrap around' modularly. This operation is bit parallel by definition, thus
+theoretically making it amenable to the techniques described in Note [Loop
+sectioning].
+
+However, the naive way of doing this runs into a problem: the byte ordering
+on Tier 1 platforms inside `Word64` means that consecutive bit indexes
+according to CIP-122 don't remain that way. We could avoid this by using a
+byte flip followed by an adjustment in the opposite direction, then a byte flip
+back again.
However, this is a costly operation, and would also be extremely
+fiddly across stride boundaries, making both performance and implementation
+clarity suffer. Instead, we use a different observation, namely that both
+shifts and rotations on the same input are monoidally homomorphic into
+natural number addition (assuming the same 'direction' for shifts). Using
+this, combined with Euclidean division, we can decompose any shift or
+rotation by `i` into two consecutive shifts and rotations:
+
+1. A 'large' shift or rotation, by `div i 8`; and
+2. A 'small' shift or rotation, by `mod i 8`.
+
+For example, a shift by 11 bits decomposes into a whole-byte shift by one
+byte (`div 11 8 = 1`), followed by a small shift by three bits (`mod 11 8 = 3`).
+
+While on paper, this seems much less efficient (as our stride is smaller),
+we also observe that the 'large' shift moves around whole bytes, while also
+keeping consecutive bytes consecutive, assuming their bit indices remain
+in-bounds. This means that we can implement step 1 both simply and efficiently:
+
+* For shifts, we perform a partial copy of all the bytes whose bits remain
+ in-bounds, followed by clearing of whatever remains.
+* For rotations, we perform two partial copies: first of all the bytes whose
+ bits remain in-bounds, followed by whatever remains, at the 'opposite end'.
+
+These can make use of the bulk copying and clearing operations provided by the
+GHC runtime. Not only are these shorter and more readable, they are also _far_
+more efficient than anything we could do, as they rely on optimized C called
+via the runtime (meaning no FFI penalty). From our experiments, both with
+these operations, and others from CIP-122, we note that the cost of these is
+essentially constant up to about the size of 1-2 cache lines (64-128 bytes):
+since we anticipate smaller inputs are far more likely, this actually runs
+_faster_ than our proposed sectioning approach, while being easier to read
+and write.
+
+It is arguable that our approach forces 'double writing', as Step 2 may have
+to overwrite our work in Step 1. However, by avoiding the need to
+perform byte flips, as well as benefitting from the huge speedups gained
+from our split approach, this cost is essentially negligible, especially
+given that we can operate mutably throughout. We also have an additional
+benefit: if the requested rotation or shift happens to be an exact multiple
+of 8, we can be _much_ faster, as Step 2 becomes unnecessary in that case.
+-}
+
+-- | Shifts, as per [CIP-123](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-0123/README.md).
+shiftByteString :: ByteString -> Int -> ByteString
+shiftByteString bs bitMove
+ | BS.null bs = bs
+ | bitMove == 0 = bs
+ | otherwise = unsafeDupablePerformIO . BS.useAsCString bs $ \srcPtr ->
+ BSI.create len $ \dstPtr -> do
+ -- To simplify our calculations, we work only with absolute values,
+ -- letting different functions control for direction, instead of
+ -- trying to unify the scheme for both positive and negative shifts.
+ let magnitude = abs bitMove
+ -- Instead of worrying about partial clearing, we just zero the entire
+ -- block of memory, as the cost is essentially negligible and saves us
+ -- a bunch of offset arithmetic.
+ fillBytes dstPtr 0x00 len
+ unless (magnitude >= bitLen) $ do
+ let (bigShift, smallShift) = magnitude `quotRem` 8
+ case signum bitMove of
+ (-1) -> negativeShift (castPtr srcPtr) dstPtr bigShift smallShift
+ _ -> positiveShift (castPtr srcPtr) dstPtr bigShift smallShift
+ where
+ len :: Int
+ !len = BS.length bs
+ bitLen :: Int
+ !bitLen = len * 8
+ negativeShift :: Ptr Word8 -> Ptr Word8 -> Int -> Int -> IO ()
+ negativeShift srcPtr dstPtr bigShift smallShift = do
+ let copyDstPtr = plusPtr dstPtr bigShift
+ let copyLen = len - bigShift
+ -- Since we already zeroed everything, we only do the partial copy.
+ copyBytes copyDstPtr srcPtr copyLen
+ when (smallShift > 0) $ do
+ -- When working with the small shift, we have to shift bits across
+ -- byte boundaries. Thus, we have to make sure that:
+ --
+ -- 1. We 'save' our first byte from being processed.
+ -- 2. We can 'select' the bits that would be shifted over the
+ -- boundary and apply them.
+ let !invSmallShift = 8 - smallShift
+ let !mask = 0xFF `Bits.unsafeShiftR` invSmallShift
+ for_ [len - 1, len - 2 .. len - copyLen + 1] $ \byteIx -> do
+ -- To handle shifts across byte boundaries, we have to 'read
+ -- backwards', mask off the relevant part, then recombine.
+ !(currentByte :: Word8) <- peekByteOff dstPtr byteIx
+ !(prevByte :: Word8) <- peekByteOff dstPtr (byteIx - 1)
+ let !prevOverflowBits = prevByte Bits..&. mask
+ let !newCurrentByte =
+ (currentByte `Bits.unsafeShiftR` smallShift)
+ Bits..|. (prevOverflowBits `Bits.unsafeShiftL` invSmallShift)
+ pokeByteOff dstPtr byteIx newCurrentByte
+ !(firstByte :: Word8) <- peekByteOff dstPtr (len - copyLen)
+ pokeByteOff dstPtr (len - copyLen) (firstByte `Bits.unsafeShiftR` smallShift)
+ -- This works similarly to `negativeShift` above, but in the opposite direction.
+ positiveShift :: Ptr Word8 -> Ptr Word8 -> Int -> Int -> IO ()
+ positiveShift srcPtr dstPtr bigShift smallShift = do
+ let copySrcPtr = plusPtr srcPtr bigShift
+ let copyLen = len - bigShift
+ copyBytes dstPtr copySrcPtr copyLen
+ when (smallShift > 0) $ do
+ let !invSmallShift = 8 - smallShift
+ let !mask = 0xFF `Bits.unsafeShiftL` invSmallShift
+ for_ [0, 1 .. copyLen - 2] $ \byteIx -> do
+ !(currentByte :: Word8) <- peekByteOff dstPtr byteIx
+ !(nextByte :: Word8) <- peekByteOff dstPtr (byteIx + 1)
+ let !nextOverflowBits = nextByte Bits..&. mask
+ let !newCurrentByte =
+ (currentByte `Bits.unsafeShiftL` smallShift)
+ Bits..|. (nextOverflowBits `Bits.unsafeShiftR` invSmallShift)
+ pokeByteOff dstPtr byteIx newCurrentByte
+ !(lastByte :: Word8) <- peekByteOff dstPtr (copyLen - 1)
+ pokeByteOff dstPtr (copyLen - 1) (lastByte `Bits.unsafeShiftL` smallShift)
+
+-- | Rotations, as per [CIP-123](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-0123/README.md).
+rotateByteString :: ByteString -> Int -> ByteString
+rotateByteString bs bitMove
+ | BS.null bs = bs
+ | otherwise =
+ -- To save ourselves some trouble, we work only with absolute rotations
+ -- (letting argument sign handle dispatch to dedicated 'directional'
+ -- functions, like for shifts), and also simplify rotations larger than
+ -- the bit length to the equivalent value modulo the bit length, as
+ -- they're equivalent.
+ let !magnitude = abs bitMove
+ !reducedMagnitude = magnitude `rem` bitLen
+ in if reducedMagnitude == 0
+ then bs
+ else unsafeDupablePerformIO .
BS.useAsCString bs $ \srcPtr ->
+ BSI.create len $ \dstPtr -> do
+ let (bigRotation, smallRotation) = reducedMagnitude `quotRem` 8
+ case signum bitMove of
+ (-1) -> negativeRotate (castPtr srcPtr) dstPtr bigRotation smallRotation
+ _ -> positiveRotate (castPtr srcPtr) dstPtr bigRotation smallRotation
+ where
+ len :: Int
+ !len = BS.length bs
+ bitLen :: Int
+ !bitLen = len * 8
+ negativeRotate :: Ptr Word8 -> Ptr Word8 -> Int -> Int -> IO ()
+ negativeRotate srcPtr dstPtr bigRotate smallRotate = do
+ -- Two partial copies are needed here, unlike with shifts, because
+ -- there's no point zeroing our data, since it'll all be overwritten
+ -- with stuff from the input anyway.
+ let copyStartDstPtr = plusPtr dstPtr bigRotate
+ let copyStartLen = len - bigRotate
+ copyBytes copyStartDstPtr srcPtr copyStartLen
+ let copyEndSrcPtr = plusPtr srcPtr copyStartLen
+ copyBytes dstPtr copyEndSrcPtr bigRotate
+ when (smallRotate > 0) $ do
+ -- This works similarly to the small-shift case in `shiftByteString`.
+ let invSmallRotate = 8 - smallRotate
+ let !mask = 0xFF `Bits.unsafeShiftR` invSmallRotate
+ !(cloneLastByte :: Word8) <- peekByteOff dstPtr (len - 1)
+ for_ [len - 1, len - 2 .. 1] $ \byteIx -> do
+ !(currentByte :: Word8) <- peekByteOff dstPtr byteIx
+ !(prevByte :: Word8) <- peekByteOff dstPtr (byteIx - 1)
+ let !prevOverflowBits = prevByte Bits..&. mask
+ let !newCurrentByte =
+ (currentByte `Bits.unsafeShiftR` smallRotate)
+ Bits..|. (prevOverflowBits `Bits.unsafeShiftL` invSmallRotate)
+ pokeByteOff dstPtr byteIx newCurrentByte
+ !(firstByte :: Word8) <- peekByteOff dstPtr 0
+ let !lastByteOverflow = cloneLastByte Bits..&. mask
+ let !newLastByte =
+ (firstByte `Bits.unsafeShiftR` smallRotate)
+ Bits..|. (lastByteOverflow `Bits.unsafeShiftL` invSmallRotate)
+ pokeByteOff dstPtr 0 newLastByte
+ positiveRotate :: Ptr Word8 -> Ptr Word8 -> Int -> Int -> IO ()
+ positiveRotate srcPtr dstPtr bigRotate smallRotate = do
+ let copyStartSrcPtr = plusPtr srcPtr bigRotate
+ let copyStartLen = len - bigRotate
+ copyBytes dstPtr copyStartSrcPtr copyStartLen
+ let copyEndDstPtr = plusPtr dstPtr copyStartLen
+ copyBytes copyEndDstPtr srcPtr bigRotate
+ when (smallRotate > 0) $ do
+ let !invSmallRotate = 8 - smallRotate
+ let !mask = 0xFF `Bits.unsafeShiftL` invSmallRotate
+ !(cloneFirstByte :: Word8) <- peekByteOff dstPtr 0
+ for_ [0, 1 .. len - 2] $ \byteIx -> do
+ !(currentByte :: Word8) <- peekByteOff dstPtr byteIx
+ !(nextByte :: Word8) <- peekByteOff dstPtr (byteIx + 1)
+ let !nextOverflowBits = nextByte Bits..&. mask
+ let !newCurrentByte =
+ (currentByte `Bits.unsafeShiftL` smallRotate)
+ Bits..|. (nextOverflowBits `Bits.unsafeShiftR` invSmallRotate)
+ pokeByteOff dstPtr byteIx newCurrentByte
+ !(lastByte :: Word8) <- peekByteOff dstPtr (len - 1)
+ let !firstOverflowBits = cloneFirstByte Bits..&. mask
+ let !newLastByte =
+ (lastByte `Bits.unsafeShiftL` smallRotate)
+ Bits..|. (firstOverflowBits `Bits.unsafeShiftR` invSmallRotate)
+ pokeByteOff dstPtr (len - 1) newLastByte
+
+-- | Counting the number of set bits, as per [CIP-123](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-0123/README.md).
+countSetBits :: ByteString -> Int
+countSetBits bs = unsafeDupablePerformIO . BS.useAsCString bs $ \srcPtr -> do
+ -- See Note [Loop sectioning] for details of why we
+ -- define this function the way it is. We make use of the fact that `popCount`
+ -- is bit-parallel, and has a constant-time implementation for `Word64` and `Word8`.
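+ -- As a quick illustration: for the two bytes 0x01 and 0x03, the result is
+ -- popCount 0x01 + popCount 0x03 = 1 + 2 = 3.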
+ let bigSrcPtr :: Ptr Word64 = castPtr srcPtr
+ let smallSrcPtr :: Ptr Word8 = plusPtr srcPtr offset
+ goBig bigSrcPtr smallSrcPtr 0 0
+ where
+ len :: Int
+ !len = BS.length bs
+ -- We do this as two separate bindings, for similar reasons as for
+ -- `integerToByteString`: we take a surprising hit to performance when
+ -- using a pair, even though eliminating it should be possible here.
+ bigStrides :: Int
+ !bigStrides = len `quot` 8
+ smallStrides :: Int
+ !smallStrides = len `rem` 8
+ offset :: Int
+ !offset = bigStrides * 8
+ goBig :: Ptr Word64 -> Ptr Word8 -> Int -> Int -> IO Int
+ goBig !bigSrcPtr !smallSrcPtr !acc !bigIx
+ | bigIx == bigStrides = goSmall smallSrcPtr acc 0
+ | otherwise = do
+ !w64 <- peekElemOff bigSrcPtr bigIx
+ goBig bigSrcPtr smallSrcPtr (acc + Bits.popCount w64) (bigIx + 1)
+ goSmall :: Ptr Word8 -> Int -> Int -> IO Int
+ goSmall !smallSrcPtr !acc !smallIx
+ | smallIx == smallStrides = pure acc
+ | otherwise = do
+ !w8 <- peekElemOff smallSrcPtr smallIx
+ goSmall smallSrcPtr (acc + Bits.popCount w8) (smallIx + 1)
+
+-- | Finding the first set bit's index, as per [CIP-123](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-0123/README.md).
+findFirstSetBit :: ByteString -> Int
+findFirstSetBit bs = unsafeDupablePerformIO . BS.useAsCString bs $ \srcPtr -> do
+ let bigSrcPtr :: Ptr Word64 = castPtr srcPtr
+ goBig bigSrcPtr 0 (len - 8)
+ where
+ -- We implement this operation in a somewhat unusual way, to try and
+ -- benefit from bit parallelism, thus allowing loop sectioning as well:
+ -- see Note [Loop sectioning] as to why we choose to
+ -- do this.
+ --
+ -- Finding the first set bit is not (inherently) bit parallel, as there is
+ -- a clear 'horizontal dependency'. Thus, we instead 'localize' this
+ -- 'horizontal dependency' by noting that the following operations _are_
+ -- bit-parallel:
+ --
+ -- 1. Checking if all bits are zero
+ -- 2. Keeping an additive accumulator
+ --
+ -- Essentially, we begin by taking large steps through our data, checking
+ -- whether we only have zeroes. This can be done in strides of 64 bits at a
+ -- time, and every time we find that many zeroes, we keep track. After we
+ -- encounter a nonzero `Word64`, we 'step down' to `Word8`-sized steps,
+ -- continuing to count zero blocks the same way. Once we encounter a
+ -- non-zero `Word8`, we can resort to the specialized operation for
+ -- counting trailing zeroes from `Data.Bits`, and 'top up' our accumulated
+ -- count to produce the index we want. If we ever 'walk off the end', we
+ -- know that there's no way we could find any set bits and return -1.
+ --
+ -- This is complicated slightly by us having to walk the input backwards
+ -- instead of forwards, but due to the requirements of the CIP-122 bit
+ -- indexing scheme, we don't really have a choice here. This doesn't
+ -- affect the description above however: it just complicates the indexing
+ -- maths required.
+ goBig :: Ptr Word64 -> Int -> Int -> IO Int
+ goBig !bigSrcPtr !acc !byteIx
+ | byteIx >= 0 = do
+ !(w64 :: Word64) <- peekByteOff bigSrcPtr byteIx
+ -- In theory, we could use the same technique here as we do in
+ -- `goSmall`, namely count speculatively and then compare to 64.
+ -- However this is not possible for us, as the native byte ordering
+ -- on Tier 1 platforms does not keep consecutive bits _across_ bytes
+ -- consecutive, which would make this result unreliable.
While we
+ -- _could_ do a byte order flip before counting (from the opposite
+ -- end) to avoid this, the cost of this operation is much larger
+ -- than a comparison to zero, and would only benefit us _once_,
+ -- instead of once-per-stride. Thus, we instead use the approach
+ -- here.
+ if w64 == 0x0
+ then goBig bigSrcPtr (acc + 64) (byteIx - 8)
+ else goSmall (castPtr bigSrcPtr) acc (byteIx + 7)
+ | byteIx <= (-8) = pure (-1)
+ | otherwise = goSmall (castPtr bigSrcPtr) acc (8 + byteIx - 1)
+ goSmall :: Ptr Word8 -> Int -> Int -> IO Int
+ goSmall !smallSrcPtr !acc !byteIx
+ | byteIx < 0 = pure (-1)
+ | otherwise = do
+ !(w8 :: Word8) <- peekByteOff smallSrcPtr byteIx
+ -- Instead of redundantly first checking for a zero byte,
+ -- then counting, we speculatively count, relying on the behaviour of
+ -- `countTrailingZeros` that, on a zero byte, we get 8.
+ let !counted = Bits.countTrailingZeros w8
+ let !newAcc = acc + counted
+ if counted == 8
+ then goSmall smallSrcPtr newAcc (byteIx - 1)
+ else pure newAcc
+ len :: Int
+ !len = BS.length bs
+
+-- Helpers
+
+{- Note [writeBits and exceptions]
+
+ As `writeBits` allows us to pass a changelist argument of any length, we
+ potentially could have an out-of-bounds index anywhere in the list. As we
+ have to fail on such cases (and report them appropriately), we end up needing
+ _both_ IO (to do mutable things) _and_ a way to signal errors. We can
+ do this in two ways:
+
+ 1. Pre-scan the changelist for any out-of-bounds indexes, fail if we see any,
+ then apply the necessary changes if no out-of-bounds indexes are found.
+ 2. Speculatively allocate the new `ByteString`, then do the changes in the
+ changelist argument one at a time, failing as soon as we see an out-of-bounds
+ index.
+
+ Option 1 would require traversing the changelist argument twice, which is
+ undesirable; option 2 is therefore the more efficient choice. The
+ natural choice for option 2 would be something similar to `ExceptT Int IO`
+ (with the `Int` being an out-of-bounds index). However, we aren't able to do
+ this, as ultimately, `ByteString`s are implemented as `ForeignPtr`s, forcing
+ us to use the following function to interact with them, directly or not:
+
+ withForeignPtr :: forall (a :: Type) . ForeignPtr a -> (Ptr a -> IO b) -> IO b
+
+ Notably, the function argument produces a result of `IO b`, whereas we would
+ need `MonadIO m => m b` instead. This means that our _only_ choice is to
+ use the exception mechanism, either directly or via some wrappers like
+ `MonadUnliftIO`. While this is unusual, and arguably against the spirit of
+ the use of `IO` relative to `ByteString` construction, we don't have any other
+ choice. We decided to use the exception mechanism directly, as while
+ `MonadUnliftIO` is a bit cleaner, it ultimately ends up doing the same thing
+ anyway, and this method at least makes it clear what we're doing.
+
+ This doesn't pose any problems from the point of view of Plutus Core, as this
+ exception cannot 'leak'; we handle it entirely within `writeBits`, and no
+ other Plutus Core code can ever see it.
+-} +newtype WriteBitsException = WriteBitsException Integer + deriving stock (Eq, Show) + +instance Exception WriteBitsException + +{- Note [Manual specialization] +For both integerToByteString and byteStringToInteger, we have to perform very +similar operations, but with small variations: + +- Most-significant-first versus most-significant-last (for both) +- Whether we have a size limit or not (for integerToByteString) + +Additionally, loop sectioning (see Note [Loop sectioning]) requires us to have +separate 'big-stride' and 'small-stride' operations to ensure universality of +input handling. Lastly, we have several subroutines (digit extraction, for +instance) that may vary in similar ways. In such a case, generalization by +means of abstraction seems like a good idea, as the operations (and +subroutines) vary little. + +At the same time, to determine which variation of any given function (or +subroutine) we need, we only have to scrutinize the relevant argument(s) once: +these specifics (such as byte order) don't change during the course of the +operation. Thus, we want to make sure that these checks in the code are _also_ +performed only once, ideally at the beginning. + +However, if we write such operations naively as so: + +> subroutine byteOrder arg1 arg2 = case byteOrder of +> LittleEndian -> ... +> BigEndian -> ... + +the byteOrder argument will be scrutinized on each call of subroutine. This is +correct in general (as there is no guarantee that the argument will be stable). +Strangely, however, even in a case like this one: + +> mainRoutine byteOrder arg1 arg2 = ... +> where +> subroutine arg3 = case byteOrder of +> LittleEndian -> ... +> BigEndian -> ... + +GHC _still_ re-scrutinizes byteOrder in every call of subroutine! This penalty +can be somewhat lessened using a form similar to this: + +> mainRoutine byteOrder arg1 arg2 = ... +> where +> !subroutine = case byteOrder of +> LittleEndian -> \arg3 -> ... +> BigEndian -> \arg3 -> ... + +but this is _still_ between 20 and 30% worse than doing something like this: + +> mainRoutine byteOrder arg1 arg2 = case byteOrder of +> LittleEndian -> [code calling subroutineLE where needed] +> BigEndian -> [code calling subroutineBE where needed] +> where +> subroutineLE arg3 = ... +> subroutineBE arg3 = ... + +This form _ensures_ we scrutinize (and branch) only the number of times we have +to, and in a predictable place. Since these are backends for Plutus Core primops, +and performance is thus critical, we choose to use this manually-specialized form +for each combination of relevant arguments. While this is repetitive, and thus +also somewhat error-prone, the performance penalty for not doing this is +unacceptable. +-} + +{- Note [Loop sectioning] + +Several operations in this module (including binary logical operations, +`integerToByteString` and `byteStringToInteger`) effectively function as loops +over fixed-width binary chunks: these can be bytes (for logical operations), +digits (for conversions) or something else. These chunks have to be read, +written or both, and may also require processing using fixed-width, +constant-time operations over those chunks from the Haskell side, in some +cases effectively 'translating' these fixed-size operations into variable-width +equivalents over `ByteString`s. In all cases, this involves trafficking data +between memory and machine registers (as `ByteString`s and `Integer`s are both +wrappers around counted arrays), as well as the overheads of looping +(involving comparison and branches). 
This trafficking is necessary not only to
+move the memory around, but also to process it, as on modern architectures,
+data must first be moved into a register in order to do anything with it.
+
+However, on all architectures of interest (essentially, 64-bit Tier 1),
+general-purpose registers (GPRs henceforth) are 64 bits (or 8 bytes).
+Furthermore, the primary cost of moving data between memory and registers is
+having to overcome the 'memory wall': the exact amount of data being moved
+doesn't affect this very much. In addition to this, when we operate on single
+bytes, the remaining 56 bits of the GPR holding that data are essentially
+'wasted'. In the situation we have (namely, operating over arrays, whose data
+is adjacent in memory), we thus get two sources of inefficiency:
+
+* Despite paying the cost for a memory transfer, we transfer only one-eighth
+ the data we could; and
+* Despite transferring data from memory to registers, we utilize the register
+ at only one-eighth capacity.
+
+This essentially means we perform _eight times_ more iterations of the loop,
+and memory moves, than we need to!
+
+To avoid this, we use a technique known as _loop sectioning_.
+Effectively, this transforms our homogeneous loop (that always operates one byte at
+a time) into a heterogeneous loop: first, we operate on a larger section (called
+a _stride_) until we can no longer do this, and then we finish up using
+byte-at-a-time processing. Essentially, when given an input like this:
+
+[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 ]
+
+the homogeneous byte-at-a-time approach would process it like so:
+
+ _ _ _ _ _ _ _ _ _ _
+[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 ]
+
+for a total of 10 memory transfers and 10 loop spins, whereas a loop-sectioned
+approach with a stride of 8 would instead process like so:
+
+ ______________________________ _ _
+[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 ]
+
+Giving us only _three_ memory transfers and _three_ loop spins instead. This
+effectively reduces our work by a factor of 8. In our cases, this is almost
+free, as there is no data processing to be done: all we need to do is copy
+from one place to another, essentially.
+
+This technique only benefits us because counted arrays are cache-friendly: see
+Note [Superscalarity and caching] for a longer explanation of this and why it
+matters.
+
+Further information:
+
+- Tier 1 GHC platform list:
+ https://gitlab.haskell.org/ghc/ghc/-/wikis/platforms#tier-1-platforms
+- Memory wall:
+ https://link.springer.com/referenceworkentry/10.1007/978-0-387-09766-4_234
+- Loop sectioning in more detail:
+ http://physics.ujep.cz/~zmoravec/prga/main_for/mergedProjects/optaps_for/common/optaps_vec_mine.htm
+-}
+
+{- Note [Superscalarity and caching]
+On modern architectures, in order to process data, it must first be moved from
+memory into a register. This operation has some cost (known as the 'memory wall'),
+which is largely independent of how much data gets moved (assuming the register
+can hold it): moving one byte, or a whole register's worth, costs about the same.
+To reduce this cost, CPU manufacturers have introduced _cache hierarchies_,
+which are designed to limit the cost of the wall, as long as the data access
+matches the cache's optimal usage pattern.
Thus, while an idealized view of
+the memory hierarchy is this:
+
+Registers
+---------
+Memory
+
+in reality, the view is more like this:
+
+Registers
+---------
+L1 cache
+---------
+L2 cache
+---------
+L3 cache (on some platforms)
+---------
+Memory
+
+Each 'higher' cache in the hierarchy is smaller, but faster, and when a memory
+fetch is requested in code, in addition to moving the requested data to a
+register, that data (plus some more) is moved into caches as well. The amount
+of data moved into cache (a _cache line_) is typically eight machine words on
+modern architectures (and this is definitely the case for all Tier 1 GHC platforms):
+for the cases concerning Plutus, that is 64 bytes. Therefore, if data we need
+soon after a fetch is _physically_ nearby, it won't need to be fetched from
+memory: instead, it would come from a cache, which is faster (by a considerable
+margin).
+
+To see how this can matter, consider the following ByteString:
+
+[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ]
+
+The ByteString (being a counted array) has all of its data physically adjacent
+to each other. Suppose we wanted to fetch the byte at index 1 (second position).
+The naive view of what happens is like this:
+
+Registers: [b2] [ ] [ ] .... [ ]
+Memory: [ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ]
+
+Thus, it would appear that, if we wanted a different position's value, we would
+need to fetch from memory again. However, what _actually_ happens is more like this:
+
+Registers: [b2] [ ] [ ] .... [ ]
+L1 cache: [ b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ]
+Memory: [ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ]
+
+We note that b2, as well as its adjacent elements, were _all_ pulled into the L1
+cache. This can only work because all these elements are physically adjacent in
+memory. The improvement in performance from this cache use is _very_ non-trivial:
+an L1 cache is about 200 times faster than a memory access, and an L2 cache about
+20 times faster.
+
+To take further advantage of this, modern CPUs (and all Tier 1 GHC platforms have
+this capability) are _superscalar_. To explain what this means, let's consider the
+naive view of how CPUs execute instructions: namely, it is one-at-a-time, and
+synchronous. While CPUs must give the _appearance_ that they behave this way, in
+practice, CPU execution is very much asynchronous: due to the proliferation of ALUs
+on a single chip, having twice as many processing units is much cheaper than having
+processing units run twice as fast. Thus, if there are no data dependencies
+between instructions, CPUs can (and do!) execute them simultaneously, stalling to
+await results if a data dependency is detected. This can be done automatically
+using Tomasulo's algorithm, which ensures no conflicts with maximum throughput.
+
+Superscalarity interacts well with the cache hierarchy, as it makes data more
+easily available for processing, provided there is enough 'work to do', and no
+data dependencies. In our situation, most of what we do is data _movement_ from
+one memory location to another, which by its very nature lacks any data
+dependencies.
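+
+To make the sectioned-loop shape described here and in Note [Loop sectioning]
+concrete, the following is a minimal sketch (illustrative only: `foldlM` is from
+Data.Foldable, and none of these names appear in this module) of a stride-8
+traversal in the style of `countSetBits` above:
+
+ sectionedPopCount :: Ptr Word8 -> Int -> IO Int
+ sectionedPopCount ptr len = do
+ let (bigStrides, smallStrides) = len `quotRem` 8
+ let bigPtr :: Ptr Word64 = castPtr ptr
+ let smallPtr :: Ptr Word8 = plusPtr ptr (bigStrides * 8)
+ -- One memory transfer per 8 bytes while we can...
+ big <- foldlM (\acc i -> (acc +) . Bits.popCount <$> peekElemOff bigPtr i) 0 [0 .. bigStrides - 1]
+ -- ...then one transfer per byte for whatever remains.
+ foldlM (\acc i -> (acc +) . Bits.popCount <$> peekElemOff smallPtr i) big [0 .. smallStrides - 1]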
+ +Further references: + +- Numbers for cache and memory transfers: https://gist.github.com/jboner/2841832 +- Superscalarity: https://en.wikipedia.org/wiki/Superscalar_processor +- Tomasulo's algorithm: https://en.wikipedia.org/wiki/Tomasulo%27s_algorithm +-} diff --git a/plutus-core/plutus-core/src/PlutusCore/Bitwise/Convert.hs b/plutus-core/plutus-core/src/PlutusCore/Bitwise/Convert.hs deleted file mode 100644 index bd6ccd317eb..00000000000 --- a/plutus-core/plutus-core/src/PlutusCore/Bitwise/Convert.hs +++ /dev/null @@ -1,544 +0,0 @@ --- editorconfig-checker-disable-file - -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE MagicHash #-} -{-# LANGUAGE OverloadedStrings #-} - --- | Implementations for conversion primops from 'Integer' to 'ByteString' and back again. -module PlutusCore.Bitwise.Convert ( - -- Wrappers - integerToByteStringWrapper, - byteStringToIntegerWrapper, - -- Implementation details - IntegerToByteStringError(..), - integerToByteStringMaximumOutputLength, - integerToByteString, - byteStringToInteger - ) where - -import PlutusCore.Builtin (BuiltinResult, emit) -import PlutusCore.Evaluation.Result (evaluationFailure) - -import ByteString.StrictBuilder (Builder) -import ByteString.StrictBuilder qualified as Builder -import Control.Monad (guard) -import Data.Bits (unsafeShiftL, unsafeShiftR, (.|.)) -import Data.ByteString (ByteString) -import Data.ByteString qualified as BS -import Data.Text (pack) -import Data.Word (Word64, Word8) -import GHC.ByteOrder (ByteOrder (BigEndian, LittleEndian)) -import GHC.Exts (Int (I#)) -import GHC.Integer.Logarithms (integerLog2#) - -{- Note [Input length limitation for IntegerToByteString]. We make - `integerToByteString` fail if it is called with arguments which would cause - the length of the result to exceed about 8K bytes because the execution time - becomes difficult to predict accurately beyond this point (benchmarks on a - number of different machines show that the CPU time increases smoothly for - inputs up to about 8K then increases sharply, becoming chaotic after about - 14K). This restriction may be removed once a more efficient implementation - becomes available, which may happen when we no longer have to support GHC - 8.10. -} -{- NB: if we do relax the length restriction then we will need two variants of - integerToByteString in Plutus Core so that we can continue to support the - current behaviour for old scripts.-} -integerToByteStringMaximumOutputLength :: Integer -integerToByteStringMaximumOutputLength = 8192 - -{- Return the base 2 logarithm of an integer, returning 0 for inputs that aren't - strictly positive. This is essentially copied from GHC.Num.Integer, which - has integerLog2 but only in GHC >= 9.0. We should use the library function - instead when we stop supporting 8.10. -} -integerLog2 :: Integer -> Int -integerLog2 !i = I# (integerLog2# i) - --- | Wrapper for 'integerToByteString' to make it more convenient to define as a builtin. -integerToByteStringWrapper :: Bool -> Integer -> Integer -> BuiltinResult ByteString -integerToByteStringWrapper endiannessArg lengthArg input - -- Check that the length is non-negative. - | lengthArg < 0 = do - emit "integerToByteString: negative length argument" - emit $ "Length requested: " <> (pack . show $ input) - evaluationFailure - -- Check that the requested length does not exceed the limit. *NB*: if we remove the limit we'll - -- still have to make sure that the length fits into an Int. - | lengthArg > integerToByteStringMaximumOutputLength = do - emit . 
pack $ "integerToByteString: requested length is too long (maximum is " - ++ (show $ integerToByteStringMaximumOutputLength) - ++ " bytes)" - emit $ "Length requested: " <> (pack . show $ lengthArg) - evaluationFailure - -- If the requested length is zero (ie, an explicit output size is not - -- specified) we still have to make sure that the output won't exceed the size - -- limit. If the requested length is nonzero and less than the limit, - -- integerToByteString checks that the input fits. - | (lengthArg == 0 -- integerLog2 n is one less than the number of significant bits in n - && fromIntegral (integerLog2 input) >= 8 * integerToByteStringMaximumOutputLength) = - let bytesRequiredFor n = (integerLog2 n) `div` 8 + 1 - -- ^ This gives 1 instead of 0 for n=0, but we'll never get that. - in do - emit . pack $ "integerToByteString: input too long (maximum is 2^" - ++ (show (8 * integerToByteStringMaximumOutputLength)) - ++ "-1)" - emit $ "Length required: " <> (pack . show $ bytesRequiredFor input) - evaluationFailure - | otherwise = let endianness = endiannessArgToByteOrder endiannessArg in - -- We use fromIntegral here, despite advice to the contrary in general when defining builtin - -- denotations. This is because, if we've made it this far, we know that overflow or truncation - -- are impossible: we've checked that whatever we got given fits inside a (non-negative) Int. - case integerToByteString endianness (fromIntegral lengthArg) input of - Left err -> case err of - NegativeInput -> do - emit "integerToByteString: cannot convert negative Integer" - -- This does work proportional to the size of input. However, we're in a failing case - -- anyway, and the user's paid for work proportional to this size in any case. - emit $ "Input: " <> (pack . show $ input) - evaluationFailure - NotEnoughDigits -> do - emit "integerToByteString: cannot represent Integer in given number of bytes" - -- This does work proportional to the size of input. However, we're in a failing case - -- anyway, and the user's paid for work proportional to this size in any case. - emit $ "Input: " <> (pack . show $ input) - emit $ "Bytes requested: " <> (pack . show $ lengthArg) - evaluationFailure - Right result -> pure result - --- | Wrapper for 'byteStringToInteger' to make it more convenient to define as a builtin. -byteStringToIntegerWrapper :: - Bool -> ByteString -> Integer -byteStringToIntegerWrapper statedEndiannessArg input = - let endianness = endiannessArgToByteOrder statedEndiannessArg in - byteStringToInteger endianness input - --- | Structured type to help indicate conversion errors. -data IntegerToByteStringError = - NegativeInput | - NotEnoughDigits - deriving stock (Eq, Show) - --- | Conversion from 'Integer' to 'ByteString', as per --- [CIP-121](https://github.com/mlabs-haskell/CIPs/tree/koz/to-from-bytestring/CIP-0121). --- --- For performance and clarity, the endianness argument uses --- 'ByteOrder', and the length argument is an 'Int'. -integerToByteString :: ByteOrder -> Int -> Integer -> Either IntegerToByteStringError ByteString -integerToByteString requestedByteOrder requestedLength input - | input < 0 = Left NegativeInput - | input == 0 = Right . BS.replicate requestedLength $ 0x00 - -- We use manual specialization to ensure as few branches in loop bodies as - -- we can. See Note [Manual specialization] for details. - | requestedLength == 0 = Right . 
Builder.builderBytes $ case requestedByteOrder of - LittleEndian -> goLENoLimit mempty input - BigEndian -> goBENoLimit mempty input - | otherwise = do - let result = case requestedByteOrder of - LittleEndian -> goLELimit mempty input - BigEndian -> goBELimit mempty input - case result of - Nothing -> Left NotEnoughDigits - Just b -> Right . Builder.builderBytes $ b - where - goLELimit :: Builder -> Integer -> Maybe Builder - goLELimit acc remaining - | remaining == 0 = pure $ padLE acc - | otherwise = do - -- builderLength is constant time, so we don't track the length ourselves - guard (Builder.builderLength acc < requestedLength) - -- This allows extracting eight digits at once. See Note [Loop sectioning] for details on - -- why we do this. We also duplicate this code in several places: see Note [Manual - -- specialization] for why. - -- - -- The code is basically equivalent to remaining `quotRem` 2^64, but more efficient. This - -- is for two reasons: firstly, GHC does not optimize divisions into shifts for Integer - -- (even if the divisor is constant), and secondly, the pair generated by `quotRem` costs - -- us as much as 15% performance, and GHC seems unable to eliminate it. Thus, we have to do - -- it like this instead. - let newRemaining = remaining `unsafeShiftR` 64 - -- Given that remaining must be non-negative, fromInteger here effectively truncates to a - -- Word64, by retaining only the least-significant 8 bytes. - let digitGroup :: Word64 = fromInteger remaining - case newRemaining of - 0 -> finishLELimit acc digitGroup - _ -> goLELimit (acc <> Builder.storable digitGroup) newRemaining - finishLELimit :: Builder -> Word64 -> Maybe Builder - finishLELimit acc remaining - | remaining == 0 = pure $ padLE acc - | otherwise = do - guard (Builder.builderLength acc < requestedLength) - -- This is equivalent to 'remaining `quotRem` 256' followed by a conversion of the - -- remainder, but faster. This is similar to the larger example above, and we do it for - -- the same reasons. - let newRemaining = remaining `unsafeShiftR` 8 - let digit :: Word8 = fromIntegral remaining - finishLELimit (acc <> Builder.word8 digit) newRemaining - -- By separating the case where we don't need to concern ourselves with a - -- user-specified limit, we can avoid branching needlessly, or doing a - -- complex expression check on every loop. See Note [Manual specialization] - -- for why this matters. - goLENoLimit :: Builder -> Integer -> Builder - goLENoLimit acc remaining - | remaining == 0 = acc - | otherwise = let newRemaining = remaining `unsafeShiftR` 64 - digitGroup :: Word64 = fromInteger remaining - in case newRemaining of - 0 -> finishLENoLimit acc digitGroup - _ -> goLENoLimit (acc <> Builder.storable digitGroup) newRemaining - finishLENoLimit :: Builder -> Word64 -> Builder - finishLENoLimit acc remaining - | remaining == 0 = acc - | otherwise = - let newRemaining = remaining `unsafeShiftR` 8 - digit :: Word8 = fromIntegral remaining - in finishLENoLimit (acc <> Builder.word8 digit) newRemaining - padLE :: Builder -> Builder - padLE acc = let paddingLength = requestedLength - Builder.builderLength acc - in acc <> Builder.bytes (BS.replicate paddingLength 0x0) - -- We manually specialize the big-endian case: see Note [Manual specialization] for why.
- goBELimit :: Builder -> Integer -> Maybe Builder - goBELimit acc remaining - | remaining == 0 = pure $ padBE acc - | otherwise = do - guard (Builder.builderLength acc < requestedLength) - let newRemaining = remaining `unsafeShiftR` 64 - let digitGroup :: Word64 = fromInteger remaining - case newRemaining of - 0 -> finishBELimit acc digitGroup - _ -> goBELimit (Builder.word64BE digitGroup <> acc) newRemaining - finishBELimit :: Builder -> Word64 -> Maybe Builder - finishBELimit acc remaining - | remaining == 0 = pure $ padBE acc - | otherwise = do - guard (Builder.builderLength acc < requestedLength) - let newRemaining = remaining `unsafeShiftR` 8 - let digit = fromIntegral remaining - finishBELimit (Builder.word8 digit <> acc) newRemaining - goBENoLimit :: Builder -> Integer -> Builder - goBENoLimit acc remaining - | remaining == 0 = acc - | otherwise = let newRemaining = remaining `unsafeShiftR` 64 - digitGroup = fromInteger remaining - in case newRemaining of - 0 -> finishBENoLimit acc digitGroup - _ -> goBENoLimit (Builder.word64BE digitGroup <> acc) newRemaining - finishBENoLimit :: Builder -> Word64 -> Builder - finishBENoLimit acc remaining - | remaining == 0 = acc - | otherwise = let newRemaining = remaining `unsafeShiftR` 8 - digit = fromIntegral remaining - in finishBENoLimit (Builder.word8 digit <> acc) newRemaining - padBE :: Builder -> Builder - padBE acc = let paddingLength = requestedLength - Builder.builderLength acc in - Builder.bytes (BS.replicate paddingLength 0x0) <> acc - --- | Conversion from 'ByteString' to 'Integer', as per --- [CIP-121](https://github.com/mlabs-haskell/CIPs/tree/koz/to-from-bytestring/CIP-0121). --- --- For clarity, the stated endianness argument uses 'ByteOrder'. -byteStringToInteger :: ByteOrder -> ByteString -> Integer - -- We use manual specialization to ensure as few branches in loop bodies as we can. See Note - -- [Manual specialization] for details. -byteStringToInteger statedByteOrder input = case statedByteOrder of - -- Since padding bytes in the most-significant-last representation go at - -- the end of the input, we can skip decoding them, as they won't affect - -- the result in any way. - LittleEndian -> case BS.findIndexEnd (/= 0x00) input of - -- If there are no nonzero bytes, it must be zero. - Nothing -> 0 - Just end -> goLE 0 end 0 - -- Since padding bytes in the most-significant-first representation go at - -- the beginning of the input, we can skip decoding them, as they won't - -- affect the result in any way. - BigEndian -> case BS.findIndex (/= 0x00) input of - Nothing -> 0 - Just end -> goBE 0 end 0 (BS.length input - 1) - where - -- Like with toByteString, we use loop sectioning to decode eight digits at once. See Note [Loop - -- sectioning] for why we do this. - goLE :: Integer -> Int -> Int -> Integer - goLE acc limit ix - | ix <= (limit - 7) = - let digitGroup = read64LE ix - -- Same as ix * 8, but faster. GHC might already do this optimization, but we may as - -- well be sure. - shift = ix `unsafeShiftL` 3 - newIx = ix + 8 - -- We use unsafeShiftL to move a group of eight digits into the right position in - -- the result, then combine with the accumulator. This is equivalent to a - -- multiplication by 2^64*k, but significantly faster, as GHC doesn't optimize - -- such multiplications into shifts for Integers. 
- newAcc = acc + fromIntegral digitGroup `unsafeShiftL` shift - in goLE newAcc limit newIx - | otherwise = finishLE acc limit ix - finishLE :: Integer -> Int -> Int -> Integer - finishLE acc limit ix - | ix > limit = acc - | otherwise = - let digit = BS.index input ix - shift = ix `unsafeShiftL` 3 - newIx = ix + 1 - -- Similarly to before, we use unsafeShiftL to move a single digit into the right - -- position in the result. - newAcc = acc + fromIntegral digit `unsafeShiftL` shift - in finishLE newAcc limit newIx - -- Technically, ByteString does not allow reading of anything bigger than a single byte. - -- However, because ByteStrings are counted arrays, caching already brings in adjacent bytes, - -- which makes fetching them quite cheap. Additionally, GHC appears to optimize this into a - -- block read of 64 bits at once, which saves memory movement. See Note [Superscalarity and - -- caching] for details of why this matters. - read64LE :: Int -> Word64 - read64LE startIx = - fromIntegral (BS.index input startIx) - .|. (fromIntegral (BS.index input (startIx + 1)) `unsafeShiftL` 8) - .|. (fromIntegral (BS.index input (startIx + 2)) `unsafeShiftL` 16) - .|. (fromIntegral (BS.index input (startIx + 3)) `unsafeShiftL` 24) - .|. (fromIntegral (BS.index input (startIx + 4)) `unsafeShiftL` 32) - .|. (fromIntegral (BS.index input (startIx + 5)) `unsafeShiftL` 40) - .|. (fromIntegral (BS.index input (startIx + 6)) `unsafeShiftL` 48) - .|. (fromIntegral (BS.index input (startIx + 7)) `unsafeShiftL` 56) - -- We manually specialize the big-endian cases: see Note [Manual specialization] for why. - -- - -- In the big-endian case, shifts and indexes change in different ways: indexes start _high_ - -- and _reduce_, but shifts start _low_ and rise. This is different to the little-endian case, - -- where both start low and rise. Thus, we track the index and shift separately in the - -- big-endian case: it makes the adjustments easier, and doesn't really change anything, as if - -- we wanted to compute the shift, we'd have to pass an offset argument anyway. - goBE :: Integer -> Int -> Int -> Int -> Integer - goBE acc limit shift ix - | ix >= (limit + 7) = - let digitGroup = read64BE ix - newShift = shift + 64 - newIx = ix - 8 - newAcc = acc + fromIntegral digitGroup `unsafeShiftL` shift - in goBE newAcc limit newShift newIx - | otherwise = finishBE acc limit shift ix - finishBE :: Integer -> Int -> Int -> Int -> Integer - finishBE acc limit shift ix - | ix < limit = acc - | otherwise = - let digit = BS.index input ix - newShift = shift + 8 - newIx = ix - 1 - newAcc = acc + fromIntegral digit `unsafeShiftL` shift - in finishBE newAcc limit newShift newIx - read64BE :: Int -> Word64 - read64BE endIx = - fromIntegral (BS.index input endIx) - .|. (fromIntegral (BS.index input (endIx - 1)) `unsafeShiftL` 8) - .|. (fromIntegral (BS.index input (endIx - 2)) `unsafeShiftL` 16) - .|. (fromIntegral (BS.index input (endIx - 3)) `unsafeShiftL` 24) - .|. (fromIntegral (BS.index input (endIx - 4)) `unsafeShiftL` 32) - .|. (fromIntegral (BS.index input (endIx - 5)) `unsafeShiftL` 40) - .|. (fromIntegral (BS.index input (endIx - 6)) `unsafeShiftL` 48) - .|. 
(fromIntegral (BS.index input (endIx - 7)) `unsafeShiftL` 56) - -endiannessArgToByteOrder :: Bool -> ByteOrder -endiannessArgToByteOrder b = if b then BigEndian else LittleEndian - -{- Note [Manual specialization] -For both integerToByteString and byteStringToInteger, we have to perform very -similar operations, but with small variations: - -- Most-significant-first versus most-significant-last (for both) -- Whether we have a size limit or not (for integerToByteString) - -Additionally, loop sectioning (see Note [Loop sectioning]) requires us to have -separate 'big-stride' and 'small-stride' operations to ensure universality of -input handling. Lastly, we have several subroutines (digit extraction, for -instance) that may vary in similar ways. In such a case, generalization by -means of abstraction seems like a good idea, as the operations (and -subroutines) vary little. - -At the same time, to determine which variation of any given function (or -subroutine) we need, we only have to scrutinize the relevant argument(s) once: -these specifics (such as byte order) don't change during the course of the -operation. Thus, we want to make sure that these checks in the code are _also_ -performed only once, ideally at the beginning. - -However, if we write such operations naively as so: - -> subroutine byteOrder arg1 arg2 = case byteOrder of -> LittleEndian -> ... -> BigEndian -> ... - -the byteOrder argument will be scrutinized on each call of subroutine. This is -correct in general (as there is no guarantee that the argument will be stable). -Strangely, however, even in a case like this one: - -> mainRoutine byteOrder arg1 arg2 = ... -> where -> subroutine arg3 = case byteOrder of -> LittleEndian -> ... -> BigEndian -> ... - -GHC _still_ re-scrutinizes byteOrder in every call of subroutine! This penalty -can be somewhat lessened using a form similar to this: - -> mainRoutine byteOrder arg1 arg2 = ... -> where -> !subroutine = case byteOrder of -> LittleEndian -> \arg3 -> ... -> BigEndian -> \arg3 -> ... - -but this is _still_ between 20 and 30% worse than doing something like this: - -> mainRoutine byteOrder arg1 arg2 = case byteOrder of -> LittleEndian -> [code calling subroutineLE where needed] -> BigEndian -> [code calling subroutineBE where needed] -> where -> subroutineLE arg3 = ... -> subroutineBE arg3 = ... - -This form _ensures_ we scrutinize (and branch) only the number of times we have -to, and in a predictable place. Since these are backends for Plutus Core primops, -and performance is thus critical, we choose to use this manually-specialized form -for each combination of relevant arguments. While this is repetitive, and thus -also somewhat error-prone, the performance penalty for not doing this is -unacceptable. --} - -{- Note [Loop sectioning] -Both integerToByteString and byteStringToInteger effectively function as loops -over digits (and thus, individual bytes), which either have to be read or -extracted. In particular, this involves trafficking data between memory and -machine registers (both ByteString and Integer are wrappers around counted -arrays), as well as the overhead of looping (involving comparisons and branches). - -However, on all architectures of interest (essentially, 64-bit Tier 1), -general-purpose registers (GPRs henceforth) are 64 bits (or 8 bytes). -Furthermore, the primary cost of moving data between memory and registers is -having to overcome the 'memory wall': the exact amount of data being moved -doesn't affect this very much. 
In addition to this, when we operate on single -bytes, the remaining 56 bits of the GPR holding that data are essentially -'wasted'. In the situation we have (namely, operating over arrays, whose data -is adjacent in memory), we thus get two sources of inefficiency: - -* Despite paying the cost for a memory transfer, we transfer only one-eighth - the data we could; and -* Despite transferring data from memory to registers, we utilize the register - at only one-eighth capacity. - -This essentially means we perform _eight times_ more rotations of the loop, -and memory moves, than we need to! - -To avoid this inefficiency, we use a technique known as _loop sectioning_. -Effectively, this turns our homogeneous loop (that always operates one byte at -a time) into a heterogeneous loop: first, we operate on a larger section (called -a _stride_) until we can no longer do this, and then we finish up using -byte-at-a-time processing. Essentially, when given an input like this: - -[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 ] - -the homogeneous byte-at-a-time approach would process it like so: - - _ _ _ _ _ _ _ _ _ _ -[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 ] - -for a total of 10 memory transfers and 10 loop spins, whereas a loop-sectioned -approach with a stride of 8 would instead process like so: - - ______________________________ _ _ -[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 ] - -This gives us only _three_ memory transfers and _three_ loop spins instead. This -effectively reduces our work by a factor of 8. In our cases, this is almost -free, as there is no data processing to be done: all we need to do is copy -from one place to another, essentially. - -This technique only benefits us because counted arrays are cache-friendly: see -Note [Superscalarity and caching] for a longer explanation of this and why it -matters. - -Further information: - -- Tier 1 GHC platform list: - https://gitlab.haskell.org/ghc/ghc/-/wikis/platforms#tier-1-platforms -- Memory wall: - https://link.springer.com/referenceworkentry/10.1007/978-0-387-09766-4_234 -- Loop sectioning in more detail: - http://physics.ujep.cz/~zmoravec/prga/main_for/mergedProjects/optaps_for/common/optaps_vec_mine.htm --} - -{- Note [Superscalarity and caching] -On modern architectures, in order to process data, it must first be moved from -memory into a register. This operation has some cost (known as the 'memory wall'), -which is largely independent of how much data gets moved (assuming the register -can hold it): moving one byte, or a whole register's worth, costs about the same. -To reduce this cost, CPU manufacturers have introduced _cache hierarchies_, -which are designed to limit the cost of the wall, as long as the data access -matches the cache's optimal usage pattern. Thus, while an idealized view of -the memory hierarchy is this: - -Registers ---------- -Memory - -in reality, the view is more like this: - -Registers ---------- -L1 cache ---------- -L2 cache ---------- -L3 cache (on some platforms) ---------- -Memory - -Each 'higher' cache in the hierarchy is smaller, but faster, and when a memory -fetch is requested in code, in addition to moving the requested data to a -register, that data (plus some more) is moved into caches as well. The amount -of data moved into cache (a _cache line_) is typically eight machine words on -modern architectures (and is definitely the case for all Tier 1 GHC platforms): -for the cases concerning Plutus, that is 64 bytes.
Therefore, if data we need -soon after a fetch is _physically_ nearby, it won't need to be fetched from -memory: instead, it would come from a cache, which is faster (by a considerable -margin). - -To see how this can matter, consider the following ByteString: - -[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ] - -The ByteString (being a counted array) has all of its data physically adjacent -to each other. Suppose we wanted to fetch the byte at index 1 (second position). -The naive view of what happens is like this: - -Registers: [b2] [ ] [ ] .... [ ] -Memory: [ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ] - -Thus, it would appear that, if we wanted a different position's value, we would -need to fetch from memory again. However, what _actually_ happens is more like this: - -Registers: [b2] [ ] [ ] .... [ ] -L1 cache: [ b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ] -Memory: [ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ] - -We note that b2, as well as its adjacent elements, were _all_ pulled into the L1 -cache. This can only work because all these elements are physically adjacent in -memory. The improvement in performance from this cache use is _very_ non-trivial: -an L1 cache is about 200 times faster than a memory access, and an L2 cache about -20 times faster. - -To take further advantage of this, modern CPUs (and all Tier 1 GHC platforms have -this capability) are _superscalar_. To explain what this means, let's consider the -naive view of how CPUs execute instructions: namely, it is one-at-a-time, and -synchronous. While CPUs must give the _appearance_ that they behave this way, in -practice, CPU execution is very much asynchronous: due to the proliferation of ALUs -on a single chip, having twice as many processing units is much cheaper than having -processing units run twice as fast. Thus, if there are no data dependencies -between instructions, CPUs can (and do!) execute them simultaneously, stalling to -await results if a data dependency is detected. This can be done automatically -using Tomasulo's algorithm, which ensures no conflicts with maximum throughput. - -Superscalarity interacts well with the cache hierarchy, as it makes data more -easily available for processing, provided there is enough 'work to do', and no -data dependencies. In our situation, most of what we do is data _movement_ from -one memory location to another, which by its very nature lacks any data -dependencies. - -Further references: - -- Numbers for cache and memory transfers: https://gist.github.com/jboner/2841832 -- Superscalarity: https://en.wikipedia.org/wiki/Superscalar_processor -- Tomasulo's algorithm: https://en.wikipedia.org/wiki/Tomasulo%27s_algorithm --} diff --git a/plutus-core/plutus-core/src/PlutusCore/Bitwise/Logical.hs b/plutus-core/plutus-core/src/PlutusCore/Bitwise/Logical.hs deleted file mode 100644 index 7e228ad80ab..00000000000 --- a/plutus-core/plutus-core/src/PlutusCore/Bitwise/Logical.hs +++ /dev/null @@ -1,464 +0,0 @@ --- editorconfig-checker-disable-file - -{-# LANGUAGE OverloadedStrings #-} - --- | Implementations of bitwise logical primops. 
-module PlutusCore.Bitwise.Logical ( - andByteString, - orByteString, - xorByteString, - complementByteString, - readBit, - writeBits, - replicateByteString - ) where - -import Control.Exception (Exception, throw, try) -import Data.Bits qualified as Bits -import Data.ByteString (ByteString) -import Data.ByteString qualified as BS -import Data.ByteString.Internal qualified as BSI -import Data.Foldable (for_, traverse_) -import Data.Text (pack) -import Data.Word (Word64, Word8) -import Foreign.Marshal.Utils (copyBytes) -import Foreign.Ptr (Ptr, castPtr, plusPtr) -import Foreign.Storable (peekByteOff, peekElemOff, pokeByteOff, pokeElemOff) -import PlutusCore.Builtin (BuiltinResult, emit) -import PlutusCore.Evaluation.Result (evaluationFailure) -import System.IO.Unsafe (unsafeDupablePerformIO) - -{- Note [Binary bitwise operation implementation and manual specialization] - - All of the 'binary' bitwise operations (namely `andByteString`, - `orByteString` and `xorByteString`) operate similarly: - - 1. Decide which of their two `ByteString` arguments determines the length - of the result. For padding semantics, this is the _longer_ argument, - whereas for truncation semantics, it's the _shorter_ one. If both - `ByteString` arguments have identical length, it doesn't matter which we - choose. - 2. Copy the choice made in step 1 into a fresh mutable buffer. - 3. Traverse over each byte of the argument _not_ chosen in step 1, and - combine each of those bytes with the byte at the corresponding index of - the fresh mutable buffer from step 2 (`.&.` for `andByteString`, - `.|.` for `orByteString`, `xor` for `xorByteString`). - - We also make use of loop sectioning to optimize this operation: see Note - [Loop sectioning], which explains why we do this. Fundamentally, this doesn't - change the logic of the operation, but means that step 3 is split into - two smaller sub-steps: we first work 8 bytes at a time, then one byte at a - time to finish up if necessary. Other than the choice of 'combining - operation', the structure of the computation is the same, which suggests that - we want a helper function with a signature like - - helper1 :: - (Word64 -> Word64 -> Word64) -> - (Word8 -> Word8 -> Word8) -> - ByteString -> - ByteString -> - Int -> - ByteString - - or possibly (to avoid duplicate argument passing) like - - helper2 :: - (forall (a :: Type) . Bits a => a -> a -> a) -> - ByteString -> - ByteString -> - Int -> - ByteString - - This would allow us to share all this logic, and have each of the 'top-level' - operations just dispatch to either of the helpers with the appropriate - function argument(s). Instead, we chose to write a manual copy of this logic - for each of the 'top-level' operations, substituting only the 'combining - operation'. - - We made this choice as any design based on either `helper1` or `helper2` is - significantly slower (at least 50% worse, and the penalty _percentage_ grows - with argument size). While `helper2` is significantly more penalizing than - `helper1`, even `helper1` reaches an almost threefold slowdown at the higher - input sizes we are interested in relative to the manual version we use here. - Due to the 'low-level' nature of Plutus Core primops, we consider these costs - unacceptable relative to the (small) benefits to code clarity and maintainability - any solution using either style of helper would provide.
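For concreteness, here is a hedged sketch of roughly what the rejected
`helper1`-based design would have looked like; `helper1` and `andByteString'`
are hypothetical names, and the loop body is elided, as it would match the
copy-then-traverse structure of the real operations below.

> helper1 ::
>   (Word64 -> Word64 -> Word64) ->
>   (Word8 -> Word8 -> Word8) ->
>   ByteString ->
>   ByteString ->
>   Int ->
>   ByteString
> helper1 combine64 combine8 toCopy toTraverse traverseLen =
>   ... -- copy toCopy into a fresh buffer, then combine: eight bytes at
>       -- a time using combine64, then byte-at-a-time using combine8
>
> andByteString' :: Bool -> ByteString -> ByteString -> ByteString
> andByteString' shouldPad bs1 bs2 =
>   let (shorter, longer) = if BS.length bs1 < BS.length bs2 then (bs1, bs2) else (bs2, bs1)
>       (toCopy, toTraverse) = if shouldPad then (longer, shorter) else (shorter, longer)
>    in helper1 (Bits..&.) (Bits..&.) toCopy toTraverse (BS.length shorter)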
- - The reason for `helper2` under-performing is unsurprising: any argument whose - type is rank-2 polymorphic with a dictionary constraint essentially acts as - a 'program template', which gets interpreted at runtime given some dictionary - for a `Bits` instance. GHC can do practically nothing to optimize this, as - there is no way to tell, for any given argument, _which_ definitions of an - instance would be required here, even if the set of operations we use is - finite, since any instance can make use of the full power of Haskell, which - essentially lands us in Rice's Theorem territory. For `helper1`, the reasons - are similar: it _must_ be able to work regardless of what functions (assuming - appropriate types) it is given, which means in general, GHC is forced to - compile mother-may-I-style code involving pointer-chasing through those arguments - at runtime. This explains why the 'blowup' becomes worse with argument length. - - While in theory inlining could help with at least the `helper1` case - (`helper2` is beyond that technique), it doesn't seem like GHC is able to - figure this out, even when `INLINE` is placed on `helper1`. - -} - --- | Bitwise logical AND, as per [CIP-122](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md). -{-# INLINEABLE andByteString #-} -andByteString :: Bool -> ByteString -> ByteString -> ByteString -andByteString shouldPad bs1 bs2 = - let (shorter, longer) = if BS.length bs1 < BS.length bs2 then (bs1, bs2) else (bs2, bs1) - (toCopy, toTraverse) = if shouldPad then (longer, shorter) else (shorter, longer) - in go toCopy toTraverse (BS.length shorter) - where - go :: ByteString -> ByteString -> Int -> ByteString - go toCopy toTraverse traverseLen = - unsafeDupablePerformIO . BS.useAsCStringLen toCopy $ \(copyPtr, copyLen) -> - BS.useAsCString toTraverse $ \traversePtr -> do - BSI.create copyLen $ \dstPtr -> do - copyBytes dstPtr (castPtr copyPtr) copyLen - let (bigStrides, littleStrides) = traverseLen `quotRem` 8 - let offset = bigStrides * 8 - let bigDstPtr :: Ptr Word64 = castPtr dstPtr - let bigTraversePtr :: Ptr Word64 = castPtr traversePtr - for_ [0 .. bigStrides - 1] $ \i -> do - w64_1 <- peekElemOff bigDstPtr i - w64_2 <- peekElemOff bigTraversePtr i - pokeElemOff bigDstPtr i $ w64_1 Bits..&. w64_2 - let smallDstPtr :: Ptr Word8 = plusPtr dstPtr offset - let smallTraversePtr :: Ptr Word8 = plusPtr traversePtr offset - for_ [0 .. littleStrides - 1] $ \i -> do - w8_1 <- peekElemOff smallDstPtr i - w8_2 <- peekElemOff smallTraversePtr i - pokeElemOff smallDstPtr i $ w8_1 Bits..&. w8_2 - --- | Bitwise logical OR, as per [CIP-122](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md). -{-# INLINEABLE orByteString #-} -orByteString :: Bool -> ByteString -> ByteString -> ByteString -orByteString shouldPad bs1 bs2 = - let (shorter, longer) = if BS.length bs1 < BS.length bs2 then (bs1, bs2) else (bs2, bs1) - (toCopy, toTraverse) = if shouldPad then (longer, shorter) else (shorter, longer) - in go toCopy toTraverse (BS.length shorter) - where - go :: ByteString -> ByteString -> Int -> ByteString - go toCopy toTraverse traverseLen = - unsafeDupablePerformIO . 
BS.useAsCStringLen toCopy $ \(copyPtr, copyLen) -> - BS.useAsCString toTraverse $ \traversePtr -> do - BSI.create copyLen $ \dstPtr -> do - copyBytes dstPtr (castPtr copyPtr) copyLen - let (bigStrides, littleStrides) = traverseLen `quotRem` 8 - let offset = bigStrides * 8 - let bigDstPtr :: Ptr Word64 = castPtr dstPtr - let bigTraversePtr :: Ptr Word64 = castPtr traversePtr - for_ [0 .. bigStrides - 1] $ \i -> do - w64_1 <- peekElemOff bigDstPtr i - w64_2 <- peekElemOff bigTraversePtr i - pokeElemOff bigDstPtr i $ w64_1 Bits..|. w64_2 - let smallDstPtr :: Ptr Word8 = plusPtr dstPtr offset - let smallTraversePtr :: Ptr Word8 = plusPtr traversePtr offset - for_ [0 .. littleStrides - 1] $ \i -> do - w8_1 <- peekElemOff smallDstPtr i - w8_2 <- peekElemOff smallTraversePtr i - pokeElemOff smallDstPtr i $ w8_1 Bits..|. w8_2 - --- | Bitwise logical XOR, as per [CIP-122](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md). -{-# INLINEABLE xorByteString #-} -xorByteString :: Bool -> ByteString -> ByteString -> ByteString -xorByteString shouldPad bs1 bs2 = - let (shorter, longer) = if BS.length bs1 < BS.length bs2 then (bs1, bs2) else (bs2, bs1) - (toCopy, toTraverse) = if shouldPad then (longer, shorter) else (shorter, longer) - in go toCopy toTraverse (BS.length shorter) - where - go :: ByteString -> ByteString -> Int -> ByteString - go toCopy toTraverse traverseLen = - unsafeDupablePerformIO . BS.useAsCStringLen toCopy $ \(copyPtr, copyLen) -> - BS.useAsCString toTraverse $ \traversePtr -> do - BSI.create copyLen $ \dstPtr -> do - copyBytes dstPtr (castPtr copyPtr) copyLen - let (bigStrides, littleStrides) = traverseLen `quotRem` 8 - let offset = bigStrides * 8 - let bigDstPtr :: Ptr Word64 = castPtr dstPtr - let bigTraversePtr :: Ptr Word64 = castPtr traversePtr - for_ [0 .. bigStrides - 1] $ \i -> do - w64_1 <- peekElemOff bigDstPtr i - w64_2 <- peekElemOff bigTraversePtr i - pokeElemOff bigDstPtr i $ Bits.xor w64_1 w64_2 - let smallDstPtr :: Ptr Word8 = plusPtr dstPtr offset - let smallTraversePtr :: Ptr Word8 = plusPtr traversePtr offset - for_ [0 .. littleStrides - 1] $ \i -> do - w8_1 <- peekElemOff smallDstPtr i - w8_2 <- peekElemOff smallTraversePtr i - pokeElemOff smallDstPtr i $ Bits.xor w8_1 w8_2 - --- | Bitwise logical complement, as per [CIP-122](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md). -{-# INLINEABLE complementByteString #-} -complementByteString :: ByteString -> ByteString -complementByteString bs = unsafeDupablePerformIO . BS.useAsCStringLen bs $ \(srcPtr, len) -> do - -- We use loop sectioning here; see Note [Loop sectioning] as to why we do this - let (bigStrides, littleStrides) = len `quotRem` 8 - let offset = bigStrides * 8 - BSI.create len $ \dstPtr -> do - let bigSrcPtr :: Ptr Word64 = castPtr srcPtr - let bigDstPtr :: Ptr Word64 = castPtr dstPtr - for_ [0 .. bigStrides - 1] $ \i -> do - w64 <- peekElemOff bigSrcPtr i - pokeElemOff bigDstPtr i . Bits.complement $ w64 - let smallSrcPtr :: Ptr Word8 = plusPtr srcPtr offset - let smallDstPtr :: Ptr Word8 = plusPtr dstPtr offset - for_ [0 .. littleStrides - 1] $ \i -> do - w8 <- peekElemOff smallSrcPtr i - pokeElemOff smallDstPtr i . Bits.complement $ w8 - --- | Bit read at index, as per [CIP-122](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md) -{-# INLINEABLE readBit #-} -readBit :: ByteString -> Int -> BuiltinResult Bool -readBit bs ix - | ix < 0 = do - emit "readBit: index out of bounds" - emit $ "Index: " <> (pack . 
show $ ix) - evaluationFailure - | ix >= len * 8 = do - emit "readBit: index out of bounds" - emit $ "Index: " <> (pack . show $ ix) - evaluationFailure - | otherwise = do - let (bigIx, littleIx) = ix `quotRem` 8 - let flipIx = len - bigIx - 1 - pure $ Bits.testBit (BS.index bs flipIx) littleIx - where - len :: Int - len = BS.length bs - --- | Bulk bit write, as per [CIP-122](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md) -{-# INLINEABLE writeBits #-} -writeBits :: ByteString -> [(Integer, Bool)] -> BuiltinResult ByteString -writeBits bs changelist = case unsafeDupablePerformIO . try $ go of - Left (WriteBitsException i) -> do - emit "writeBits: index out of bounds" - emit $ "Index: " <> (pack . show $ i) - evaluationFailure - Right result -> pure result - where - -- This is written in a somewhat strange way. See Note [writeBits and - -- exceptions], which covers why we did this. - go :: IO ByteString - go = BS.useAsCString bs $ \srcPtr -> - BSI.create len $ \dstPtr -> do - copyBytes dstPtr (castPtr srcPtr) len - traverse_ (setAtIx dstPtr) changelist - len :: Int - len = BS.length bs - bitLen :: Integer - bitLen = fromIntegral len * 8 - setAtIx :: Ptr Word8 -> (Integer, Bool) -> IO () - setAtIx ptr (i, b) - | i < 0 = throw $ WriteBitsException i - | i >= bitLen = throw $ WriteBitsException i - | otherwise = do - let (bigIx, littleIx) = i `quotRem` 8 - let flipIx = len - fromIntegral bigIx - 1 - w8 :: Word8 <- peekByteOff ptr flipIx - let toWrite = if b - then Bits.setBit w8 . fromIntegral $ littleIx - else Bits.clearBit w8 . fromIntegral $ littleIx - pokeByteOff ptr flipIx toWrite - --- | Byte replication, as per [CIP-122](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md) -replicateByteString :: Int -> Word8 -> BuiltinResult ByteString -replicateByteString len w8 - | len < 0 = do - emit "byteStringReplicate: negative length requested" - evaluationFailure - | otherwise = pure . BS.replicate len $ w8 - --- Helpers - -{- Note [writeBits and exceptions] - - As `writeBits` allows us to pass a changelist argument of any length, we - could potentially have an out-of-bounds index anywhere in the list. As we - have to fail on such cases (and report them appropriately), we end up needing - _both_ IO (to do mutable things) _and_ a way to signal errors. We can - do this in two ways: - - 1. Pre-scan the changelist for any out-of-bounds indexes, fail if we see any, - then apply the necessary changes if no out-of-bounds indexes are found. - 2. Speculatively allocate the new `ByteString`, then do the changes in the - changelist argument one at a time, failing as soon as we see an out-of-bounds - index. - - Option 1 would require traversing the changelist argument twice, which is - undesirable; thus, option 2 is the more efficient choice. The - natural choice for option 2 would be something similar to `ExceptT Int IO` - (with the `Int` being an out-of-bounds index). However, we aren't able to do - this, as ultimately, `ByteString`s are implemented as `ForeignPtr`s, forcing - us to use the following function to interact with them, directly or not: - - withForeignPtr :: forall (a :: Type) . ForeignPtr a -> (Ptr a -> IO b) -> IO b - - Notably, the function argument produces a result of `IO b`, whereas we would - need `MonadIO m => m b` instead. This means that our _only_ choice is to - use the exception mechanism, either directly or via some wrappers like - `MonadUnliftIO`.
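To illustrate the pattern in miniature, here is a hedged, self-contained
sketch using the same imports as this module; `BadIndex` and `zeroBytesAt` are
hypothetical names, not the `writeBits` code above. The exception is thrown
from inside the `BSI.create` callback, and `try` converts it back into a pure
value at the boundary.

> data BadIndex = BadIndex Integer
>   deriving stock (Eq, Show)
>
> instance Exception BadIndex
>
> -- Zero out the bytes at the given indexes, failing on any bad index.
> zeroBytesAt :: ByteString -> [Integer] -> Either BadIndex ByteString
> zeroBytesAt bs ixes = unsafeDupablePerformIO . try $
>   BS.useAsCString bs $ \srcPtr ->
>     BSI.create len $ \dstPtr -> do
>       copyBytes dstPtr (castPtr srcPtr) len
>       for_ ixes $ \i ->
>         if i < 0 || i >= fromIntegral len
>           then throw (BadIndex i)
>           else pokeByteOff dstPtr (fromIntegral i) (0x00 :: Word8)
>   where
>     len = BS.length bs

In `writeBits` itself, the same shape appears, with `WriteBitsException`
playing the role of `BadIndex`.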
While this is unusual, and arguably against the spirit of - the use of `IO` relative to `ByteString` construction, we don't have any other - choice. We decided to use the exception mechanism directly, as while - `MonadUnliftIO` is a bit cleaner, it ultimately ends up doing the same thing - anyway, and this method at least makes it clear what we're doing. - - This doesn't pose any problems from the point of view of Plutus Core, as this - exception cannot 'leak'; we handle it entirely within `writeBits`, and no - other Plutus Core code can ever see it. --} -newtype WriteBitsException = WriteBitsException Integer - deriving stock (Eq, Show) - -instance Exception WriteBitsException - -{- Note [Loop sectioning] - -Several operations in this module effectively function as loops over bytes, -which have to be read, written, or both. Furthermore, we usually need to -process these bytes somehow, typically using fixed-width bitwise operations -from the Haskell side, thus allowing us to 'translate' these same operations -to the variable-width `ByteString` arguments we are dealing with. This involves -significant trafficking of data between memory and machine registers (as -`ByteString`s are wrapped counted arrays), as well as the overheads of looping -(involving comparisons and branches). This trafficking is necessary not only -to move the memory around, but also to process it, as on modern architectures, -data must first be moved into a register in order to do anything with it. - -On all architectures of interest (essentially, 64-bit Tier 1), general-purpose -registers (GPRs henceforth) are 64 bits (or 8 bytes) wide. Furthermore, the -primary cost of moving data between memory and registers is having to overcome -the 'memory wall': the exact amount of data being moved doesn't affect this -much. In addition to this, when we operate on single bytes, the remaining 56 -bits of the GPR holding that data are essentially 'wasted'. In the situation -we are in (namely, operating over arrays, whose data is adjacent in memory), -we thus get two sources of inefficiency: - -* Despite paying the cost for a memory transfer, we move only one-eighth of - the data we could; and -* Despite transferring data from memory to registers, we use these registers - only at one-eighth capacity. - -In short, we do _eight times_ more rotations of the loop, and memory moves, -than we need to! - -To avoid this, we use a technique called _loop sectioning_. Effectively, this -transforms our homogeneous loop (that always works one byte at a time) into a -heterogeneous loop: first, we operate on a larger section (called a _stride_) -until we can no longer do this, and then we finish up using byte-at-a-time -processing. Essentially, given an input like this: - -[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 ] - -the homogeneous byte-at-a-time approach would process it like so: - - _ _ _ _ _ _ _ _ _ _ -[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 ] - -for a total of 10 memory transfers and 10 loop spins, whereas a loop-sectioned -approach with a stride of 8 would instead process like so: - - ______________________________ _ _ -[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10 ] - -This gives us only _three_ memory transfers and _three_ loop spins instead. This -effectively reduces our work by a factor of 8. In our cases, this is significant. - -This technique only benefits us because counted arrays are cache-friendly: see -Note [Superscalarity and caching] for a longer explanation of this and why it -matters.
- -Further information: - -- Tier 1 GHC platform list: - https://gitlab.haskell.org/ghc/ghc/-/wikis/platforms#tier-1-platforms -- Memory wall: - https://link.springer.com/referenceworkentry/10.1007/978-0-387-09766-4_234 -- Loop sectioning in more detail: - http://physics.ujep.cz/~zmoravec/prga/main_for/mergedProjects/optaps_for/common/optaps_vec_mine.htm --} - -{- Note [Superscalarity and caching] -On modern architectures, in order to process data, it must first be moved from -memory into a register. This operation has some cost (known as the 'memory wall'), -which is largely independent of how much data gets moved (assuming the register -can hold it): moving one byte, or a whole register's worth, costs about the same. -To reduce this cost, CPU manufacturers have introduced _cache hierarchies_, -which are designed to limit the cost of the wall, as long as the data access -matches the cache's optimal usage pattern. Thus, while an idealized view of -the memory hierarchy is this: - -Registers ---------- -Memory - -in reality, the view is more like this: - -Registers ---------- -L1 cache ---------- -L2 cache ---------- -L3 cache (on some platforms) ---------- -Memory - -Each 'higher' cache in the hierarchy is smaller, but faster, and when a memory -fetch is requested in code, in addition to moving the requested data to a -register, that data (plus some more) is moved into caches as well. The amount -of data moved into cache (a _cache line_) is typically eight machine words on -modern architectures (and is definitely the case for all Tier 1 GHC platforms): -for the cases concerning Plutus, that is 64 bytes. Therefore, if data we need -soon after a fetch is _physically_ nearby, it won't need to be fetched from -memory: instead, it would come from a cache, which is faster (by a considerable -margin). - -To see how this can matter, consider the following ByteString: - -[ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ] - -The ByteString (being a counted array) has all of its data physically adjacent -to each other. Suppose we wanted to fetch the byte at index 1 (second position). -The naive view of what happens is like this: - -Registers: [b2] [ ] [ ] .... [ ] -Memory: [ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ] - -Thus, it would appear that, if we wanted a different position's value, we would -need to fetch from memory again. However, what _actually_ happens is more like this: - -Registers: [b2] [ ] [ ] .... [ ] -L1 cache: [ b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ] -Memory: [ b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11 ] - -We note that b2, as well as its adjacent elements, were _all_ pulled into the L1 -cache. This can only work because all these elements are physically adjacent in -memory. The improvement in performance from this cache use is _very_ non-trivial: -an L1 cache is about 200 times faster than a memory access, and an L2 cache about -20 times faster. - -To take further advantage of this, modern CPUs (and all Tier 1 GHC platforms have -this capability) are _superscalar_. To explain what this means, let's consider the -naive view of how CPUs execute instructions: namely, it is one-at-a-time, and -synchronous. While CPUs must give the _appearance_ that they behave this way, in -practice, CPU execution is very much asynchronous: due to the proliferation of ALUs -on a single chip, having twice as many processing units is much cheaper than having -processing units run twice as fast. Thus, if there are no data dependencies -between instructions, CPUs can (and do!)
execute them simultaneously, stalling to -await results if a data dependency is detected. This can be done automatically -using Tomasulo's algorithm, which ensures no conflicts with maximum throughput. - -Superscalarity interacts well with the cache hierarchy, as it makes data more -easily available for processing, provided there is enough 'work to do', and no -data dependencies. In our situation, most of what we do is data _movement_ from -one memory location to another, which by its very nature lacks any data -dependencies. - -Further references: - -- Numbers for cache and memory transfers: https://gist.github.com/jboner/2841832 -- Superscalarity: https://en.wikipedia.org/wiki/Superscalar_processor -- Tomasulo's algorithm: https://en.wikipedia.org/wiki/Tomasulo%27s_algorithm --} diff --git a/plutus-core/plutus-core/src/PlutusCore/Default/Builtins.hs b/plutus-core/plutus-core/src/PlutusCore/Default/Builtins.hs index a34d129237f..ecc6bc4f5f0 100644 --- a/plutus-core/plutus-core/src/PlutusCore/Default/Builtins.hs +++ b/plutus-core/plutus-core/src/PlutusCore/Default/Builtins.hs @@ -7,7 +7,6 @@ {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE TypeApplications #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeOperators #-} {-# LANGUAGE UndecidableInstances #-} @@ -26,8 +25,7 @@ import PlutusCore.Evaluation.Machine.ExMemoryUsage (ExMemoryUsage, LiteralByteSi import PlutusCore.Evaluation.Result (EvaluationResult (..)) import PlutusCore.Pretty (PrettyConfigPlc) -import PlutusCore.Bitwise.Convert as Convert -import PlutusCore.Bitwise.Logical as Logical +import PlutusCore.Bitwise qualified as Bitwise import PlutusCore.Crypto.BLS12_381.G1 qualified as BLS12_381.G1 import PlutusCore.Crypto.BLS12_381.G2 qualified as BLS12_381.G2 import PlutusCore.Crypto.BLS12_381.Pairing qualified as BLS12_381.Pairing @@ -160,7 +158,12 @@ data DefaultFun | ComplementByteString | ReadBit | WriteBits - | ReplicateByteString + | ReplicateByte + -- Bitwise + | ShiftByteString + | RotateByteString + | CountSetBits + | FindFirstSetBit deriving stock (Show, Eq, Ord, Enum, Bounded, Generic, Ix) deriving anyclass (NFData, Hashable, PrettyBy PrettyConfigPlc) @@ -1817,7 +1820,7 @@ instance uni ~ DefaultUni => ToBuiltinMeaning uni DefaultFun where let integerToByteStringDenotation :: Bool -> LiteralByteSize -> Integer -> BuiltinResult BS.ByteString {- The second argument is wrapped in a LiteralByteSize to allow us to interpret it as a size during costing. It appears as an integer in UPLC: see Note [Integral types as Integer]. 
-} - integerToByteStringDenotation b (LiteralByteSize w) n = integerToByteStringWrapper b w n + integerToByteStringDenotation b (LiteralByteSize w) = Bitwise.integerToByteStringWrapper b w {-# INLINE integerToByteStringDenotation #-} in makeBuiltinMeaning integerToByteStringDenotation @@ -1825,7 +1828,7 @@ instance uni ~ DefaultUni => ToBuiltinMeaning uni DefaultFun where toBuiltinMeaning _semvar ByteStringToInteger = let byteStringToIntegerDenotation :: Bool -> BS.ByteString -> Integer - byteStringToIntegerDenotation = byteStringToIntegerWrapper + byteStringToIntegerDenotation = Bitwise.byteStringToIntegerWrapper {-# INLINE byteStringToIntegerDenotation #-} in makeBuiltinMeaning byteStringToIntegerDenotation @@ -1834,7 +1837,7 @@ instance uni ~ DefaultUni => ToBuiltinMeaning uni DefaultFun where -- Logical toBuiltinMeaning _semvar AndByteString = let andByteStringDenotation :: Bool -> BS.ByteString -> BS.ByteString -> BS.ByteString - andByteStringDenotation = Logical.andByteString + andByteStringDenotation = Bitwise.andByteString {-# INLINE andByteStringDenotation #-} in makeBuiltinMeaning andByteStringDenotation @@ -1842,7 +1845,7 @@ instance uni ~ DefaultUni => ToBuiltinMeaning uni DefaultFun where toBuiltinMeaning _semvar OrByteString = let orByteStringDenotation :: Bool -> BS.ByteString -> BS.ByteString -> BS.ByteString - orByteStringDenotation = Logical.orByteString + orByteStringDenotation = Bitwise.orByteString {-# INLINE orByteStringDenotation #-} in makeBuiltinMeaning orByteStringDenotation @@ -1850,7 +1853,7 @@ instance uni ~ DefaultUni => ToBuiltinMeaning uni DefaultFun where toBuiltinMeaning _semvar XorByteString = let xorByteStringDenotation :: Bool -> BS.ByteString -> BS.ByteString -> BS.ByteString - xorByteStringDenotation = Logical.xorByteString + xorByteStringDenotation = Bitwise.xorByteString {-# INLINE xorByteStringDenotation #-} in makeBuiltinMeaning xorByteStringDenotation @@ -1858,7 +1861,7 @@ instance uni ~ DefaultUni => ToBuiltinMeaning uni DefaultFun where toBuiltinMeaning _semvar ComplementByteString = let complementByteStringDenotation :: BS.ByteString -> BS.ByteString - complementByteStringDenotation = Logical.complementByteString + complementByteStringDenotation = Bitwise.complementByteString {-# INLINE complementByteStringDenotation #-} in makeBuiltinMeaning complementByteStringDenotation @@ -1866,7 +1869,7 @@ instance uni ~ DefaultUni => ToBuiltinMeaning uni DefaultFun where toBuiltinMeaning _semvar ReadBit = let readBitDenotation :: BS.ByteString -> Int -> BuiltinResult Bool - readBitDenotation = Logical.readBit + readBitDenotation = Bitwise.readBit {-# INLINE readBitDenotation #-} in makeBuiltinMeaning readBitDenotation @@ -1874,20 +1877,54 @@ instance uni ~ DefaultUni => ToBuiltinMeaning uni DefaultFun where toBuiltinMeaning _semvar WriteBits = let writeBitsDenotation :: BS.ByteString -> [(Integer, Bool)] -> BuiltinResult BS.ByteString - writeBitsDenotation = Logical.writeBits + writeBitsDenotation = Bitwise.writeBits {-# INLINE writeBitsDenotation #-} in makeBuiltinMeaning writeBitsDenotation (runCostingFunTwoArguments . 
unimplementedCostingFun) - toBuiltinMeaning _semvar ReplicateByteString = - let byteStringReplicateDenotation :: Int -> Word8 -> BuiltinResult BS.ByteString - byteStringReplicateDenotation = Logical.replicateByteString - {-# INLINE byteStringReplicateDenotation #-} + toBuiltinMeaning _semvar ReplicateByte = + let replicateByteDenotation :: Int -> Word8 -> BuiltinResult BS.ByteString + replicateByteDenotation = Bitwise.replicateByte + {-# INLINE replicateByteDenotation #-} in makeBuiltinMeaning - byteStringReplicateDenotation + replicateByteDenotation (runCostingFunTwoArguments . unimplementedCostingFun) + -- Bitwise + + toBuiltinMeaning _semvar ShiftByteString = + let shiftByteStringDenotation :: BS.ByteString -> Int -> BS.ByteString + shiftByteStringDenotation = Bitwise.shiftByteString + {-# INLINE shiftByteStringDenotation #-} + in makeBuiltinMeaning + shiftByteStringDenotation + (runCostingFunTwoArguments . unimplementedCostingFun) + + toBuiltinMeaning _semvar RotateByteString = + let rotateByteStringDenotation :: BS.ByteString -> Int -> BS.ByteString + rotateByteStringDenotation = Bitwise.rotateByteString + {-# INLINE rotateByteStringDenotation #-} + in makeBuiltinMeaning + rotateByteStringDenotation + (runCostingFunTwoArguments . unimplementedCostingFun) + + toBuiltinMeaning _semvar CountSetBits = + let countSetBitsDenotation :: BS.ByteString -> Int + countSetBitsDenotation = Bitwise.countSetBits + {-# INLINE countSetBitsDenotation #-} + in makeBuiltinMeaning + countSetBitsDenotation + (runCostingFunOneArgument . unimplementedCostingFun) + + toBuiltinMeaning _semvar FindFirstSetBit = + let findFirstSetBitDenotation :: BS.ByteString -> Int + findFirstSetBitDenotation = Bitwise.findFirstSetBit + {-# INLINE findFirstSetBitDenotation #-} + in makeBuiltinMeaning + findFirstSetBitDenotation + (runCostingFunOneArgument . unimplementedCostingFun) + -- See Note [Inlining meanings of builtins]. 
{-# INLINE toBuiltinMeaning #-} @@ -2021,7 +2058,12 @@ instance Flat DefaultFun where ComplementByteString -> 78 ReadBit -> 79 WriteBits -> 80 - ReplicateByteString -> 81 + ReplicateByte -> 81 + + ShiftByteString -> 82 + RotateByteString -> 83 + CountSetBits -> 84 + FindFirstSetBit -> 85 decode = go =<< decodeBuiltin where go 0 = pure AddInteger @@ -2105,7 +2147,11 @@ instance Flat DefaultFun where go 78 = pure ComplementByteString go 79 = pure ReadBit go 80 = pure WriteBits - go 81 = pure ReplicateByteString + go 81 = pure ReplicateByte + go 82 = pure ShiftByteString + go 83 = pure RotateByteString + go 84 = pure CountSetBits + go 85 = pure FindFirstSetBit go t = fail $ "Failed to decode builtin tag, got: " ++ show t size _ n = n + builtinTagWidth diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/CountSetBits.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/CountSetBits.plc.golden new file mode 100644 index 00000000000..aa49a117436 --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/CountSetBits.plc.golden @@ -0,0 +1 @@ +bytestring -> integer \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/FindFirstSetBit.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/FindFirstSetBit.plc.golden new file mode 100644 index 00000000000..aa49a117436 --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/FindFirstSetBit.plc.golden @@ -0,0 +1 @@ +bytestring -> integer \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/ReplicateByte.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/ReplicateByte.plc.golden new file mode 100644 index 00000000000..fcb192a96ed --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/ReplicateByte.plc.golden @@ -0,0 +1 @@ +integer -> integer -> bytestring \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/RotateByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/RotateByteString.plc.golden new file mode 100644 index 00000000000..fbda7bdf852 --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/RotateByteString.plc.golden @@ -0,0 +1 @@ +bytestring -> integer -> bytestring \ No newline at end of file diff --git a/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/ShiftByteString.plc.golden b/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/ShiftByteString.plc.golden new file mode 100644 index 00000000000..fbda7bdf852 --- /dev/null +++ b/plutus-core/plutus-core/test/TypeSynthesis/Golden/DefaultFun/ShiftByteString.plc.golden @@ -0,0 +1 @@ +bytestring -> integer -> bytestring \ No newline at end of file diff --git a/plutus-core/plutus-ir/src/PlutusIR/Transform/RewriteRules/CommuteFnWithConst.hs b/plutus-core/plutus-ir/src/PlutusIR/Transform/RewriteRules/CommuteFnWithConst.hs index 4db5179eb6b..0424128e68f 100644 --- a/plutus-core/plutus-ir/src/PlutusIR/Transform/RewriteRules/CommuteFnWithConst.hs +++ b/plutus-core/plutus-ir/src/PlutusIR/Transform/RewriteRules/CommuteFnWithConst.hs @@ -130,11 +130,15 @@ isCommutative = \case IntegerToByteString -> False ByteStringToInteger -> False -- Currently, this requires commutativity in all arguments, which the - -- logical operations are not. + -- logical and bitwise operations are not. 
AndByteString -> False OrByteString -> False XorByteString -> False ComplementByteString -> False ReadBit -> False WriteBits -> False - ReplicateByteString -> False + ReplicateByte -> False + ShiftByteString -> False + RotateByteString -> False + CountSetBits -> False + FindFirstSetBit -> False diff --git a/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Bitwise.hs b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Bitwise.hs new file mode 100644 index 00000000000..55341b27a93 --- /dev/null +++ b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Bitwise.hs @@ -0,0 +1,446 @@ +-- editorconfig-checker-disable-file + +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TypeApplications #-} + +-- | Tests for [this +-- CIP](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-XXXX/CIP-XXXX.md) +module Evaluation.Builtins.Bitwise ( + shiftHomomorphism, + rotateHomomorphism, + csbHomomorphism, + shiftClear, + rotateRollover, + csbRotate, + shiftPosClearLow, + shiftNegClearHigh, + rotateMoveBits, + csbComplement, + csbInclusionExclusion, + csbXor, + ffsReplicate, + ffsXor, + ffsIndex, + ffsZero + ) where + +import Control.Monad (unless) +import Data.ByteString (ByteString) +import Data.ByteString qualified as BS +import Evaluation.Helpers (assertEvaluatesToConstant, evaluateTheSame, evaluateToHaskell, + evaluatesToConstant, forAllByteString, forAllByteStringThat) +import Hedgehog (Property, forAll, property) +import Hedgehog.Gen qualified as Gen +import Hedgehog.Range qualified as Range +import PlutusCore qualified as PLC +import PlutusCore.MkPlc (builtin, mkConstant, mkIterAppNoAnn) +import Test.Tasty (TestTree) +import Test.Tasty.Hedgehog (testPropertyNamed) +import Test.Tasty.HUnit (testCase) + +-- | Finding the first set bit in a bytestring with only zero bytes should always give -1. +ffsZero :: Property +ffsZero = property $ do + len <- forAll . Gen.integral . Range.linear 0 $ 512 + let bs = BS.replicate len 0x00 + let rhs = mkIterAppNoAnn (builtin () PLC.FindFirstSetBit) [ + mkConstant @ByteString () bs + ] + evaluatesToConstant @Integer (negate 1) rhs + +-- | If we find a valid index for the first set bit, then: +-- +-- 1. The specified index should have a set bit; and +-- 2. Any valid smaller index should have a clear bit. +-- +-- We 'hack' the generator slightly here to ensure we don't end up with all-zeroes (or the empty +-- bytestring), as otherwise, the test wouldn't be meaningful. +ffsIndex :: Property +ffsIndex = property $ do + bs <- forAllByteStringThat (BS.any (/= 0x00)) 0 512 + let ffsExp = mkIterAppNoAnn (builtin () PLC.FindFirstSetBit) [ + mkConstant @ByteString () bs + ] + ix <- evaluateToHaskell ffsExp + let hitIxExp = mkIterAppNoAnn (builtin () PLC.ReadBit) [ + mkConstant @ByteString () bs, + mkConstant @Integer () ix + ] + evaluatesToConstant True hitIxExp + unless (ix == 0) $ do + i <- forAll . Gen.integral . Range.linear 0 $ ix - 1 + let missIxExp = mkIterAppNoAnn (builtin () PLC.ReadBit) [ + mkConstant @ByteString () bs, + mkConstant @Integer () i + ] + evaluatesToConstant False missIxExp + +-- | For any choice of bytestring, if we XOR it with itself, there should be no set bits; that is, +-- finding the first set bit should give @-1@. 
+ffsXor :: Property
+ffsXor = property $ do
+  bs <- forAllByteString 0 512
+  semantics <- forAll Gen.bool
+  let rhsInner = mkIterAppNoAnn (builtin () PLC.XorByteString) [
+        mkConstant @Bool () semantics,
+        mkConstant @ByteString () bs,
+        mkConstant @ByteString () bs
+        ]
+  let rhs = mkIterAppNoAnn (builtin () PLC.FindFirstSetBit) [
+        rhsInner
+        ]
+  evaluatesToConstant @Integer (negate 1) rhs
+
+-- | If we replicate any byte any (positive) number of times, the first set bit should be the same
+-- as in the case where we replicated it exactly once.
+ffsReplicate :: Property
+ffsReplicate = property $ do
+  n <- forAll . Gen.integral . Range.linear 1 $ 512
+  w8 <- forAll . Gen.integral . Range.linear 0 $ 255
+  let lhsInner = mkIterAppNoAnn (builtin () PLC.ReplicateByte) [
+        mkConstant @Integer () n,
+        mkConstant @Integer () w8
+        ]
+  let lhs = mkIterAppNoAnn (builtin () PLC.FindFirstSetBit) [
+        lhsInner
+        ]
+  let rhsInner = mkIterAppNoAnn (builtin () PLC.ReplicateByte) [
+        mkConstant @Integer () 1,
+        mkConstant @Integer () w8
+        ]
+  let rhs = mkIterAppNoAnn (builtin () PLC.FindFirstSetBit) [
+        rhsInner
+        ]
+  evaluateTheSame lhs rhs
+
+-- | For any bytestring with bit length @n@ and @k@ set bits, its complement should have
+-- @n - k@ set bits.
+csbComplement :: Property
+csbComplement = property $ do
+  bs <- forAllByteString 0 512
+  let bitLen = BS.length bs * 8
+  let lhs = mkIterAppNoAnn (builtin () PLC.CountSetBits) [
+        mkConstant @ByteString () bs
+        ]
+  let rhsComplement = mkIterAppNoAnn (builtin () PLC.ComplementByteString) [
+        mkConstant @ByteString () bs
+        ]
+  let rhsCount = mkIterAppNoAnn (builtin () PLC.CountSetBits) [
+        rhsComplement
+        ]
+  let rhs = mkIterAppNoAnn (builtin () PLC.SubtractInteger) [
+        mkConstant @Integer () (fromIntegral bitLen),
+        rhsCount
+        ]
+  evaluateTheSame lhs rhs
+
+-- | The inclusion-exclusion principle: specifically, for any @x@ and @y@, the number of set bits in
+-- @x XOR y@ should be the number of set bits in @x OR y@ minus the number of set bits in @x AND y@.
+csbInclusionExclusion :: Property
+csbInclusionExclusion = property $ do
+  x <- forAllByteString 0 512
+  y <- forAllByteString 0 512
+  let lhsInner = mkIterAppNoAnn (builtin () PLC.XorByteString) [
+        mkConstant @Bool () False,
+        mkConstant @ByteString () x,
+        mkConstant @ByteString () y
+        ]
+  let lhs = mkIterAppNoAnn (builtin () PLC.CountSetBits) [
+        lhsInner
+        ]
+  let rhsOr = mkIterAppNoAnn (builtin () PLC.OrByteString) [
+        mkConstant @Bool () False,
+        mkConstant @ByteString () x,
+        mkConstant @ByteString () y
+        ]
+  let rhsAnd = mkIterAppNoAnn (builtin () PLC.AndByteString) [
+        mkConstant @Bool () False,
+        mkConstant @ByteString () x,
+        mkConstant @ByteString () y
+        ]
+  let rhsCountOr = mkIterAppNoAnn (builtin () PLC.CountSetBits) [
+        rhsOr
+        ]
+  let rhsCountAnd = mkIterAppNoAnn (builtin () PLC.CountSetBits) [
+        rhsAnd
+        ]
+  let rhs = mkIterAppNoAnn (builtin () PLC.SubtractInteger) [
+        rhsCountOr,
+        rhsCountAnd
+        ]
+  evaluateTheSame lhs rhs
+
+-- | For any bytestring @x@, the number of set bits in @x XOR x@ should be 0.
+csbXor :: Property +csbXor = property $ do + bs <- forAllByteString 0 512 + semantics <- forAll Gen.bool + let rhsInner = mkIterAppNoAnn (builtin () PLC.XorByteString) [ + mkConstant @Bool () semantics, + mkConstant @ByteString () bs, + mkConstant @ByteString () bs + ] + let rhs = mkIterAppNoAnn (builtin () PLC.CountSetBits) [ + rhsInner + ] + evaluatesToConstant @Integer 0 rhs + +-- | There should exist a monoid homomorphism between natural number addition and function composition for +-- shifts over a fixed bytestring argument. +shiftHomomorphism :: [TestTree] +shiftHomomorphism = [ + testPropertyNamed "zero shift is identity" "zero_shift_id" idProp, + -- Because the homomorphism on shifts is more restrictive than on rotations (namely, it is for + -- naturals and their negative equivalents, not integers), we separate the composition property + -- into two: one dealing with non-negative, the other with non-positive. This helps a bit with + -- coverage, as otherwise, we wouldn't necessarily cover both paths equally well, as we'd have to + -- either discard mismatched signs (which are likely) or 'hack them in-place', which would skew + -- distributions. + testPropertyNamed "non-negative addition of shifts is composition" "plus_shift_pos_comp" plusCompProp, + testPropertyNamed "non-positive addition of shifts is composition" "plus_shift_neg_comp" minusCompProp + ] + where + idProp :: Property + idProp = property $ do + bs <- forAllByteString 0 512 + let lhs = mkIterAppNoAnn (builtin () PLC.ShiftByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () 0 + ] + evaluatesToConstant bs lhs + plusCompProp :: Property + plusCompProp = property $ do + bs <- forAllByteString 0 512 + i <- forAll . Gen.integral . Range.linear 0 $ 512 + j <- forAll . Gen.integral . Range.linear 0 $ 512 + let lhsInner = mkIterAppNoAnn (builtin () PLC.AddInteger) [ + mkConstant @Integer () i, + mkConstant @Integer () j + ] + let lhs = mkIterAppNoAnn (builtin () PLC.ShiftByteString) [ + mkConstant @ByteString () bs, + lhsInner + ] + let rhsInner = mkIterAppNoAnn (builtin () PLC.ShiftByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () i + ] + let rhs = mkIterAppNoAnn (builtin () PLC.ShiftByteString) [ + rhsInner, + mkConstant @Integer () j + ] + evaluateTheSame lhs rhs + minusCompProp :: Property + minusCompProp = property $ do + bs <- forAllByteString 0 512 + i <- forAll . Gen.integral . Range.linear 0 $ negate 512 + j <- forAll . Gen.integral . Range.linear 0 $ negate 512 + let lhsInner = mkIterAppNoAnn (builtin () PLC.AddInteger) [ + mkConstant @Integer () i, + mkConstant @Integer () j + ] + let lhs = mkIterAppNoAnn (builtin () PLC.ShiftByteString) [ + mkConstant @ByteString () bs, + lhsInner + ] + let rhsInner = mkIterAppNoAnn (builtin () PLC.ShiftByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () i + ] + let rhs = mkIterAppNoAnn (builtin () PLC.ShiftByteString) [ + rhsInner, + mkConstant @Integer () j + ] + evaluateTheSame lhs rhs + +-- | There should exist a monoid homomorphism between integer addition and function composition for +-- rotations over a fixed bytestring argument. 
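+-- That is, rotating by @0@ should be the identity, and rotating by @i + j@ should be the same as
+-- rotating by @i@, then by @j@.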
+rotateHomomorphism :: [TestTree] +rotateHomomorphism = [ + testPropertyNamed "zero rotation is identity" "zero_rotate_id" idProp, + testPropertyNamed "addition of rotations is composition" "plus_rotate_comp" compProp + ] + where + idProp :: Property + idProp = property $ do + bs <- forAllByteString 0 512 + let lhs = mkIterAppNoAnn (builtin () PLC.RotateByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () 0 + ] + evaluatesToConstant bs lhs + compProp :: Property + compProp = property $ do + bs <- forAllByteString 0 512 + i <- forAll . Gen.integral . Range.linear (negate 512) $ 512 + j <- forAll . Gen.integral . Range.linear (negate 512) $ 512 + let lhsInner = mkIterAppNoAnn (builtin () PLC.AddInteger) [ + mkConstant @Integer () i, + mkConstant @Integer () j + ] + let lhs = mkIterAppNoAnn (builtin () PLC.RotateByteString) [ + mkConstant @ByteString () bs, + lhsInner + ] + let rhsInner = mkIterAppNoAnn (builtin () PLC.RotateByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () i + ] + let rhs = mkIterAppNoAnn (builtin () PLC.RotateByteString) [ + rhsInner, + mkConstant @Integer () j + ] + evaluateTheSame lhs rhs + +-- | There should exist a monoid homomorphism between bytestring concatenation and natural number +-- addition. +csbHomomorphism :: [TestTree] +csbHomomorphism = [ + testCase "count of empty is zero" $ do + let lhs = mkIterAppNoAnn (builtin () PLC.CountSetBits) [ + mkConstant @ByteString () "" + ] + assertEvaluatesToConstant @Integer 0 lhs, + testPropertyNamed "count of concat is addition" "concat_count_plus" compProp + ] + where + compProp :: Property + compProp = property $ do + bs1 <- forAllByteString 0 512 + bs2 <- forAllByteString 0 512 + let lhsInner = mkIterAppNoAnn (builtin () PLC.AppendByteString) [ + mkConstant @ByteString () bs1, + mkConstant @ByteString () bs2 + ] + let lhs = mkIterAppNoAnn (builtin () PLC.CountSetBits) [ + lhsInner + ] + let rhsLeft = mkIterAppNoAnn (builtin () PLC.CountSetBits) [ + mkConstant @ByteString () bs1 + ] + let rhsRight = mkIterAppNoAnn (builtin () PLC.CountSetBits) [ + mkConstant @ByteString () bs2 + ] + let rhs = mkIterAppNoAnn (builtin () PLC.AddInteger) [ + rhsLeft, + rhsRight + ] + evaluateTheSame lhs rhs + +-- | Shifting by more than the bit length (either positive or negative) clears the result. +shiftClear :: Property +shiftClear = property $ do + bs <- forAllByteString 0 512 + let bitLen = 8 * BS.length bs + i <- forAll . Gen.integral . Range.linear (negate 256) $ 256 + adjustment <- case signum i of + (-1) -> pure $ negate bitLen + i + -- Here, we shift by the length exactly, so we randomly pick negative or positive + 0 -> forAll . Gen.element $ [bitLen, negate bitLen] + _ -> pure $ bitLen + i + let lhs = mkIterAppNoAnn (builtin () PLC.ShiftByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () (fromIntegral adjustment) + ] + let rhsInner = mkIterAppNoAnn (builtin () PLC.LengthOfByteString) [ + mkConstant @ByteString () bs + ] + let rhs = mkIterAppNoAnn (builtin () PLC.ReplicateByte) [ + rhsInner, + mkConstant @Integer () 0 + ] + evaluateTheSame lhs rhs + +-- | Positive shifts clear low-index bits. +shiftPosClearLow :: Property +shiftPosClearLow = property $ do + bs <- forAllByteString 1 512 + let bitLen = 8 * BS.length bs + n <- forAll . Gen.integral . Range.linear 1 $ bitLen - 1 + i <- forAll . Gen.integral . 
Range.linear 0 $ n - 1 + let lhsInner = mkIterAppNoAnn (builtin () PLC.ShiftByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () (fromIntegral n) + ] + let lhs = mkIterAppNoAnn (builtin () PLC.ReadBit) [ + lhsInner, + mkConstant @Integer () (fromIntegral i) + ] + evaluatesToConstant False lhs + +-- | Negative shifts clear high-index bits. +shiftNegClearHigh :: Property +shiftNegClearHigh = property $ do + bs <- forAllByteString 1 512 + let bitLen = 8 * BS.length bs + n <- forAll . Gen.integral . Range.linear 1 $ bitLen - 1 + i <- forAll . Gen.integral . Range.linear 0 $ n - 1 + let lhsInner = mkIterAppNoAnn (builtin () PLC.ShiftByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () (fromIntegral . negate $ n) + ] + let lhs = mkIterAppNoAnn (builtin () PLC.ReadBit) [ + lhsInner, + mkConstant @Integer () (fromIntegral $ bitLen - i - 1) + ] + evaluatesToConstant False lhs + +-- | Rotations by more than the bit length 'roll over' bits. +rotateRollover :: Property +rotateRollover = property $ do + bs <- forAllByteString 0 512 + let bitLen = 8 * BS.length bs + i <- forAll . Gen.integral . Range.linear (negate 512) $ 512 + let lhs = mkIterAppNoAnn (builtin () PLC.RotateByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () (case signum i of + (-1) -> (negate . fromIntegral $ bitLen) + i + _ -> fromIntegral bitLen + i) + ] + let rhs = mkIterAppNoAnn (builtin () PLC.RotateByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () i + ] + evaluateTheSame lhs rhs + +-- | Rotations move bits, but don't change them. +rotateMoveBits :: Property +rotateMoveBits = property $ do + bs <- forAllByteString 1 512 + let bitLen = 8 * BS.length bs + i <- forAll . Gen.integral . Range.linear 0 $ bitLen - 1 + j <- forAll . Gen.integral . Range.linear (negate 256) $ 256 + let lhs = mkIterAppNoAnn (builtin () PLC.ReadBit) [ + mkConstant @ByteString () bs, + mkConstant @Integer () (fromIntegral i) + ] + let rhsRotation = mkIterAppNoAnn (builtin () PLC.RotateByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () (fromIntegral j) + ] + let rhsIndex = mkIterAppNoAnn (builtin () PLC.ModInteger) [ + mkConstant @Integer () (fromIntegral $ i + j), + mkConstant @Integer () (fromIntegral bitLen) + ] + let rhs = mkIterAppNoAnn (builtin () PLC.ReadBit) [ + rhsRotation, + rhsIndex + ] + evaluateTheSame lhs rhs + +-- | Rotations do not change how many set (and clear) bits there are. +csbRotate :: Property +csbRotate = property $ do + bs <- forAllByteString 0 512 + i <- forAll . Gen.integral . 
Range.linear (negate 512) $ 512 + let lhs = mkIterAppNoAnn (builtin () PLC.CountSetBits) [ + mkConstant @ByteString () bs + ] + let rhsInner = mkIterAppNoAnn (builtin () PLC.RotateByteString) [ + mkConstant @ByteString () bs, + mkConstant @Integer () i + ] + let rhs = mkIterAppNoAnn (builtin () PLC.CountSetBits) [ + rhsInner + ] + evaluateTheSame lhs rhs diff --git a/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Conversion.hs b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Conversion.hs index ba5929d7ff1..82653a7b600 100644 --- a/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Conversion.hs +++ b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Conversion.hs @@ -21,7 +21,7 @@ module Evaluation.Builtins.Conversion ( import Evaluation.Builtins.Common (typecheckEvaluateCek) import PlutusCore qualified as PLC -import PlutusCore.Bitwise.Convert (integerToByteStringMaximumOutputLength) +import PlutusCore.Bitwise (integerToByteStringMaximumOutputLength) import PlutusCore.Evaluation.Machine.ExBudgetingDefaults (defaultBuiltinCostModelForTesting) import PlutusCore.MkPlc (builtin, mkConstant, mkIterAppNoAnn) import PlutusPrelude (Word8, def) diff --git a/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Definition.hs b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Definition.hs index 060fc0f2b35..fab83cf11a4 100644 --- a/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Definition.hs +++ b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Definition.hs @@ -28,6 +28,8 @@ import PlutusCore.Pretty import PlutusPrelude import UntypedPlutusCore.Evaluation.Machine.Cek +import Evaluation.Builtins.Bitwise qualified as Bitwise +import Hedgehog hiding (Opaque, Size, Var) import PlutusCore qualified as PLC import PlutusCore.Examples.Builtins import PlutusCore.Examples.Data.Data @@ -54,7 +56,6 @@ import Evaluation.Builtins.Laws qualified as Laws import Evaluation.Builtins.SignatureVerification (ecdsaSecp256k1Prop, ed25519_VariantAProp, ed25519_VariantBProp, ed25519_VariantCProp, schnorrSecp256k1Prop) -import Hedgehog hiding (Opaque, Size, Var) import Hedgehog.Gen qualified as Gen import Hedgehog.Range qualified as Range import Prettyprinter (vsep) @@ -887,7 +888,7 @@ cons = mkConstant () -- Test that the SECP256k1 builtins are behaving correctly test_SignatureVerification :: TestTree test_SignatureVerification = - adjustOption (\x -> max x . HedgehogTestLimit . Just $ 8000) . + adjustOption (\x -> max x . HedgehogTestLimit . Just $ 4000) . testGroup "Signature verification" $ [ testGroup "Ed25519 signatures (VariantA)" [ testPropertyNamed @@ -922,7 +923,7 @@ test_SignatureVerification = -- Test that the Integer <-> ByteString conversion builtins are behaving correctly test_Conversion :: TestTree test_Conversion = - adjustOption (\x -> max x . HedgehogTestLimit . Just $ 8000) . + adjustOption (\x -> max x . HedgehogTestLimit . Just $ 4000) . testGroup "Integer <-> ByteString conversions" $ [ testGroup "Integer -> ByteString" [ --- lengthOfByteString (integerToByteString e d 0) = d @@ -958,10 +959,55 @@ test_Conversion = ] ] +-- Tests of the laws from [this +-- CIP](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-XXXX/CIP-XXXX.md) +test_Bitwise :: TestTree +test_Bitwise = + adjustOption (\x -> max x . HedgehogTestLimit . Just $ 4000) . 
+ testGroup "Bitwise" $ [ + testGroup "shiftByteString" [ + testGroup "homomorphism" Bitwise.shiftHomomorphism, + testPropertyNamed "shifts over bit length clear input" "shift_too_much" + Bitwise.shiftClear, + testPropertyNamed "positive shifts clear low indexes" "shift_pos_low" + Bitwise.shiftPosClearLow, + testPropertyNamed "negative shifts clear high indexes" "shift_neg_high" + Bitwise.shiftNegClearHigh + ], + testGroup "rotateByteString" [ + testGroup "homomorphism" Bitwise.rotateHomomorphism, + testPropertyNamed "rotations over bit length roll over" "rotate_too_much" + Bitwise.rotateRollover, + testPropertyNamed "rotations move bits but don't change them" "rotate_move" + Bitwise.rotateMoveBits + ], + testGroup "countSetBits" [ + testGroup "homomorphism" Bitwise.csbHomomorphism, + testPropertyNamed "rotation preserves count" "popcount_rotate" + Bitwise.csbRotate, + testPropertyNamed "count of the complement" "popcount_complement" + Bitwise.csbComplement, + testPropertyNamed "inclusion-exclusion" "popcount_inclusion_exclusion" + Bitwise.csbInclusionExclusion, + testPropertyNamed "count of self-XOR" "popcount_self_xor" + Bitwise.csbXor + ], + testGroup "findFirstSetBit" [ + testPropertyNamed "find first in zero bytestrings" "ffs_zero" + Bitwise.ffsZero, + testPropertyNamed "find first in replicated" "ffs_replicate" + Bitwise.ffsReplicate, + testPropertyNamed "find first of self-XOR" "ffs_xor" + Bitwise.ffsXor, + testPropertyNamed "found index set, lower indices clear" "ffs_index" + Bitwise.ffsIndex + ] + ] + -- Tests for the logical operations, as per [CIP-122](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md) test_Logical :: TestTree test_Logical = - adjustOption (\x -> max x . HedgehogTestLimit . Just $ 4000) . + adjustOption (\x -> max x . HedgehogTestLimit . Just $ 2000) . 
testGroup "Logical" $ [ testGroup "andByteString" [ Laws.abelianSemigroupLaws "truncation" PLC.AndByteString False, @@ -988,7 +1034,7 @@ test_Logical = Laws.xorInvoluteLaw, Laws.abelianMonoidLaws "padding" PLC.XorByteString True "" ], - testGroup "bitwiseLogicalComplement" [ + testGroup "complementByteString" [ Laws.complementSelfInverse, Laws.deMorgan ], @@ -998,7 +1044,7 @@ test_Logical = Laws.setSet, Laws.writeBitsHomomorphismLaws ], - testGroup "replicateByteString" [ + testGroup "replicateByte" [ Laws.replicateHomomorphismLaws, Laws.replicateIndex ] @@ -1042,4 +1088,5 @@ test_definition = , test_ConsByteString , test_Conversion , test_Logical + , test_Bitwise ] diff --git a/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Laws.hs b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Laws.hs index a7bbe8021ea..cce4f034f9d 100644 --- a/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Laws.hs +++ b/plutus-core/untyped-plutus-core/test/Evaluation/Builtins/Laws.hs @@ -22,17 +22,14 @@ module Evaluation.Builtins.Laws ( import Data.ByteString (ByteString) import Data.ByteString qualified as BS -import Evaluation.Builtins.Common (typecheckEvaluateCek, typecheckReadKnownCek) +import Evaluation.Helpers (evaluateTheSame, evaluateToHaskell, evaluatesToConstant, + forAllByteString) import GHC.Exts (fromString) -import Hedgehog (Gen, Property, PropertyT, annotateShow, failure, forAll, forAllWith, property, - (===)) +import Hedgehog (Gen, Property, PropertyT, forAll, forAllWith, property) import Hedgehog.Gen qualified as Gen import Hedgehog.Range qualified as Range -import Numeric (showHex) import PlutusCore qualified as PLC -import PlutusCore.Evaluation.Machine.ExBudgetingDefaults (defaultBuiltinCostModelForTesting) import PlutusCore.MkPlc (builtin, mkConstant, mkIterAppNoAnn) -import PlutusPrelude (Word8, def) import Test.Tasty (TestTree, testGroup) import Test.Tasty.Hedgehog (testPropertyNamed) import UntypedPlutusCore qualified as UPLC @@ -41,10 +38,10 @@ import UntypedPlutusCore qualified as UPLC -- every valid index, namely the byte specified. replicateIndex :: TestTree replicateIndex = testPropertyNamed "every byte is the same" "replicate_all_match" . property $ do - n <- forAll . Gen.integral . Range.linear 1 $ 1024 + n <- forAll . Gen.integral . Range.linear 1 $ 512 b <- forAll . Gen.integral . Range.constant 0 $ 255 i <- forAll . Gen.integral . Range.linear 0 $ n - 1 - let lhsInner = mkIterAppNoAnn (builtin () PLC.ReplicateByteString) [ + let lhsInner = mkIterAppNoAnn (builtin () PLC.ReplicateByte) [ mkConstant @Integer () n, mkConstant @Integer () b ] @@ -52,43 +49,32 @@ replicateIndex = testPropertyNamed "every byte is the same" "replicate_all_match lhsInner, mkConstant @Integer () i ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsInteger) [ - lhs, - mkConstant @Integer () b - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluatesToConstant @Integer b lhs -- | If you retrieve a bit value at an index, then write that same value to -- the same index, nothing should happen. getSet :: TestTree getSet = testPropertyNamed "get-set" "get_set" . 
property $ do - bs <- forAllByteString1 + bs <- forAllByteString 1 512 i <- forAllIndexOf bs let lookupExp = mkIterAppNoAnn (builtin () PLC.ReadBit) [ mkConstant @ByteString () bs, mkConstant @Integer () i ] - case typecheckReadKnownCek def defaultBuiltinCostModelForTesting lookupExp of - Left err -> annotateShow err >> failure - Right (Left err) -> annotateShow err >> failure - Right (Right b) -> do - let lhs = mkIterAppNoAnn (builtin () PLC.WriteBits) [ - mkConstant @ByteString () bs, - mkConstant @[(Integer, Bool)] () [(i, b)] - ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - mkConstant @ByteString () bs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + b <- evaluateToHaskell lookupExp + let lhs = mkIterAppNoAnn (builtin () PLC.WriteBits) [ + mkConstant @ByteString () bs, + mkConstant @[(Integer, Bool)] () [(i, b)] + ] + evaluatesToConstant bs lhs -- | If you write a bit value to an index, then retrieve the bit value at the -- same index, you should get back what you wrote. setGet :: TestTree setGet = testPropertyNamed "set-get" "set_get" . property $ do - bs <- forAllByteString1 + bs <- forAllByteString 1 512 i <- forAllIndexOf bs b <- forAll Gen.bool let lhsInner = mkIterAppNoAnn (builtin () PLC.WriteBits) [ @@ -99,13 +85,13 @@ setGet = lhsInner, mkConstant @Integer () i ] - evaluateAndVerify (mkConstant @Bool () b) lhs + evaluatesToConstant b lhs -- | If you write twice to the same bit index, the second write should win. setSet :: TestTree setSet = testPropertyNamed "set-set" "set_set" . property $ do - bs <- forAllByteString1 + bs <- forAllByteString 1 512 i <- forAllIndexOf bs b1 <- forAll Gen.bool b2 <- forAll Gen.bool @@ -117,11 +103,7 @@ setSet = mkConstant @ByteString () bs, mkConstant @[(Integer, Bool)] () [(i, b2)] ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - rhs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluateTheSame lhs rhs -- | Checks that: -- @@ -137,19 +119,15 @@ writeBitsHomomorphismLaws = where identityProp :: Property identityProp = property $ do - bs <- forAllByteString1 + bs <- forAllByteString 1 512 let lhs = mkIterAppNoAnn (builtin () PLC.WriteBits) [ mkConstant @ByteString () bs, mkConstant @[(Integer, Bool)] () [] ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - mkConstant @ByteString () bs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluatesToConstant bs lhs compositionProp :: Property compositionProp = property $ do - bs <- forAllByteString1 + bs <- forAllByteString 1 512 changelist1 <- forAllChangelistOf bs changelist2 <- forAllChangelistOf bs let lhsInner = mkIterAppNoAnn (builtin () PLC.WriteBits) [ @@ -164,11 +142,7 @@ writeBitsHomomorphismLaws = mkConstant @ByteString () bs, mkConstant @[(Integer, Bool)] () (changelist1 <> changelist2) ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - rhs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluateTheSame lhs rhs -- | Checks that: -- @@ -186,25 +160,21 @@ replicateHomomorphismLaws = identityProp :: Property identityProp = property $ do b <- forAll . Gen.integral . 
Range.constant 0 $ 255 - let lhs = mkIterAppNoAnn (builtin () PLC.ReplicateByteString) [ + let lhs = mkIterAppNoAnn (builtin () PLC.ReplicateByte) [ mkConstant @Integer () 0, mkConstant @Integer () b ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - mkConstant @ByteString () "" - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluatesToConstant @ByteString "" lhs compositionProp :: Property compositionProp = property $ do b <- forAll . Gen.integral . Range.constant 0 $ 255 n1 <- forAll . Gen.integral . Range.linear 0 $ 512 n2 <- forAll . Gen.integral . Range.linear 0 $ 512 - let lhsInner1 = mkIterAppNoAnn (builtin () PLC.ReplicateByteString) [ + let lhsInner1 = mkIterAppNoAnn (builtin () PLC.ReplicateByte) [ mkConstant @Integer () n1, mkConstant @Integer () b ] - let lhsInner2 = mkIterAppNoAnn (builtin () PLC.ReplicateByteString) [ + let lhsInner2 = mkIterAppNoAnn (builtin () PLC.ReplicateByte) [ mkConstant @Integer () n2, mkConstant @Integer () b ] @@ -212,32 +182,24 @@ replicateHomomorphismLaws = lhsInner1, lhsInner2 ] - let rhs = mkIterAppNoAnn (builtin () PLC.ReplicateByteString) [ + let rhs = mkIterAppNoAnn (builtin () PLC.ReplicateByte) [ mkConstant @Integer () (n1 + n2), mkConstant @Integer () b ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - rhs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluateTheSame lhs rhs -- | If you complement a 'ByteString' twice, nothing should change. complementSelfInverse :: TestTree complementSelfInverse = testPropertyNamed "self-inverse" "self_inverse" . property $ do - bs <- forAllByteString + bs <- forAllByteString 0 512 let lhsInner = mkIterAppNoAnn (builtin () PLC.ComplementByteString) [ mkConstant @ByteString () bs ] let lhs = mkIterAppNoAnn (builtin () PLC.ComplementByteString) [ lhsInner ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - mkConstant @ByteString () bs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluatesToConstant bs lhs -- | Checks that: -- @@ -252,8 +214,8 @@ deMorgan = testGroup "De Morgan's laws" [ go :: UPLC.DefaultFun -> UPLC.DefaultFun -> Property go f g = property $ do semantics <- forAllWith showSemantics Gen.bool - bs1 <- forAllByteString - bs2 <- forAllByteString + bs1 <- forAllByteString 0 512 + bs2 <- forAllByteString 0 512 let lhsInner = mkIterAppNoAnn (builtin () f) [ mkConstant @Bool () semantics, mkConstant @ByteString () bs1, @@ -273,16 +235,12 @@ deMorgan = testGroup "De Morgan's laws" [ rhsInner1, rhsInner2 ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - rhs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluateTheSame lhs rhs -- | If you XOR any 'ByteString' with itself twice, nothing should change. xorInvoluteLaw :: TestTree xorInvoluteLaw = testPropertyNamed "involute (both)" "involute_both" . property $ do - bs <- forAllByteString + bs <- forAllByteString 0 512 semantics <- forAllWith showSemantics Gen.bool let lhsInner = mkIterAppNoAnn (builtin () PLC.XorByteString) [ mkConstant @Bool () semantics, @@ -294,11 +252,7 @@ xorInvoluteLaw = testPropertyNamed "involute (both)" "involute_both" . 
property mkConstant @ByteString () bs, lhsInner ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - mkConstant @ByteString () bs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluatesToConstant bs lhs -- | Checks that the first 'DefaultFun' distributes over the second from the -- left, given the specified semantics (as a 'Bool'). More precisely, for @@ -346,17 +300,13 @@ idempotenceLaw name f isPadding = where idempProp :: Property idempProp = property $ do - bs <- forAllByteString + bs <- forAllByteString 0 512 let lhs = mkIterAppNoAnn (builtin () f) [ mkConstant @Bool () isPadding, mkConstant @ByteString () bs, mkConstant @ByteString () bs ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - mkConstant @ByteString () bs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluatesToConstant bs lhs -- | Checks that the provided 'ByteString' is an absorbing element for the -- given 'DefaultFun', under the given semantics. Specifically, given @f@ @@ -370,17 +320,13 @@ absorbtionLaw name f isPadding absorber = where absorbProp :: Property absorbProp = property $ do - bs <- forAllByteString + bs <- forAllByteString 0 512 let lhs = mkIterAppNoAnn (builtin () f) [ mkConstant @Bool () isPadding, mkConstant @ByteString () bs, mkConstant @ByteString () absorber ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - mkConstant @ByteString () absorber, - lhs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluatesToConstant absorber lhs -- Helpers @@ -391,9 +337,9 @@ showSemantics b = if b leftDistProp :: UPLC.DefaultFun -> UPLC.DefaultFun -> Bool -> Property leftDistProp f distOp isPadding = property $ do - x <- forAllByteString - y <- forAllByteString - z <- forAllByteString + x <- forAllByteString 0 512 + y <- forAllByteString 0 512 + z <- forAllByteString 0 512 let distLhs = mkIterAppNoAnn (builtin () distOp) [ mkConstant @Bool () isPadding, mkConstant @ByteString () y, @@ -419,17 +365,13 @@ leftDistProp f distOp isPadding = property $ do distRhs1, distRhs2 ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - rhs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluateTheSame lhs rhs rightDistProp :: UPLC.DefaultFun -> Bool -> Property rightDistProp f isPadding = property $ do - x <- forAllByteString - y <- forAllByteString - z <- forAllByteString + x <- forAllByteString 0 512 + y <- forAllByteString 0 512 + z <- forAllByteString 0 512 let lhsInner = mkIterAppNoAnn (builtin () f) [ mkConstant @Bool () isPadding, mkConstant @ByteString () x, @@ -455,16 +397,12 @@ rightDistProp f isPadding = property $ do rhsInner1, rhsInner2 ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - rhs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluateTheSame lhs rhs commProp :: UPLC.DefaultFun -> Bool -> Property commProp f isPadding = property $ do - data1 <- forAllByteString - data2 <- forAllByteString + data1 <- forAllByteString 0 512 + data2 <- forAllByteString 0 512 let lhs = mkIterAppNoAnn (builtin () f) [ mkConstant @Bool () isPadding, mkConstant @ByteString () data1, @@ -475,17 +413,13 @@ commProp f isPadding = property $ do mkConstant @ByteString () data2, mkConstant @ByteString () data1 ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - rhs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluateTheSame lhs rhs assocProp :: UPLC.DefaultFun -> Bool 
-> Property assocProp f isPadding = property $ do - data1 <- forAllByteString - data2 <- forAllByteString - data3 <- forAllByteString + data1 <- forAllByteString 0 512 + data2 <- forAllByteString 0 512 + data3 <- forAllByteString 0 512 let data12 = mkIterAppNoAnn (builtin () f) [ mkConstant @Bool () isPadding, mkConstant @ByteString () data1, @@ -506,31 +440,17 @@ assocProp f isPadding = property $ do mkConstant @ByteString () data1, data23 ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - rhs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp + evaluateTheSame lhs rhs unitProp :: UPLC.DefaultFun -> Bool -> ByteString -> Property unitProp f isPadding unit = property $ do - bs <- forAllByteString + bs <- forAllByteString 0 512 let lhs = mkIterAppNoAnn (builtin () f) [ mkConstant @Bool () isPadding, mkConstant @ByteString () bs, mkConstant @ByteString () unit ] - let compareExp = mkIterAppNoAnn (builtin () PLC.EqualsByteString) [ - lhs, - mkConstant @ByteString () bs - ] - evaluateAndVerify (mkConstant @Bool () True) compareExp - -forAllByteString :: PropertyT IO ByteString -forAllByteString = forAllWith hexShow . Gen.bytes . Range.linear 0 $ 1024 - -forAllByteString1 :: PropertyT IO ByteString -forAllByteString1 = forAllWith hexShow . Gen.bytes . Range.linear 1 $ 1024 + evaluatesToConstant bs lhs forAllIndexOf :: ByteString -> PropertyT IO Integer forAllIndexOf bs = forAll . Gen.integral . Range.linear 0 . fromIntegral $ BS.length bs * 8 - 1 @@ -543,23 +463,3 @@ forAllChangelistOf bs = len = BS.length bs genIndex :: Gen Integer genIndex = Gen.integral . Range.linear 0 . fromIntegral $ len * 8 - 1 - -hexShow :: ByteString -> String -hexShow = ("0x" <>) . BS.foldl' (\acc w8 -> acc <> byteToHex w8) "" - where - byteToHex :: Word8 -> String - byteToHex w8 - | w8 < 128 = "0" <> showHex w8 "" - | otherwise = showHex w8 "" - -evaluateAndVerify :: - UPLC.Term UPLC.Name UPLC.DefaultUni UPLC.DefaultFun () -> - PLC.Term UPLC.TyName UPLC.Name UPLC.DefaultUni UPLC.DefaultFun () -> - PropertyT IO () -evaluateAndVerify expected actual = - case typecheckEvaluateCek def defaultBuiltinCostModelForTesting actual of - Left x -> annotateShow x >> failure - Right (res, logs) -> case res of - PLC.EvaluationFailure -> annotateShow logs >> failure - PLC.EvaluationSuccess r -> r === expected - diff --git a/plutus-core/untyped-plutus-core/test/Evaluation/Helpers.hs b/plutus-core/untyped-plutus-core/test/Evaluation/Helpers.hs new file mode 100644 index 00000000000..46a18553fed --- /dev/null +++ b/plutus-core/untyped-plutus-core/test/Evaluation/Helpers.hs @@ -0,0 +1,136 @@ +-- editorconfig-checker-disable-file +{-# LANGUAGE KindSignatures #-} +{-# LANGUAGE LambdaCase #-} + +-- | Various helpers for defining evaluation tests. 
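+--
+-- As a usage sketch (the property name and the exact ranges here are illustrative, not taken from
+-- the test suite), a typical property generates inputs, builds a term, and checks its evaluation:
+--
+-- > zeroCountProp :: Property
+-- > zeroCountProp = property $ do
+-- >   n <- forAll . Gen.integral . Range.linear 0 $ 512
+-- >   let inner = mkIterAppNoAnn (builtin () PLC.ReplicateByte) [
+-- >         mkConstant @Integer () n,
+-- >         mkConstant @Integer () 0
+-- >         ]
+-- >   let term = mkIterAppNoAnn (builtin () PLC.CountSetBits) [inner]
+-- >   evaluatesToConstant @Integer 0 term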
+module Evaluation.Helpers (
+  -- * Generators
+  forAllByteString,
+  forAllByteStringThat,
+  -- * Evaluation helpers
+  evaluateTheSame,
+  evaluatesToConstant,
+  assertEvaluatesToConstant,
+  evaluateToHaskell,
+  ) where
+
+import Data.ByteString (ByteString)
+import Data.ByteString qualified as BS
+import Data.Kind (Type)
+import Evaluation.Builtins.Common (typecheckEvaluateCek, typecheckReadKnownCek)
+import GHC.Stack (HasCallStack)
+import Hedgehog (PropertyT, annotateShow, failure, forAllWith, (===))
+import Hedgehog.Gen qualified as Gen
+import Hedgehog.Range qualified as Range
+import Numeric (showHex)
+import PlutusCore qualified as PLC
+import PlutusCore.Builtin (ReadKnownIn)
+import PlutusCore.Evaluation.Machine.ExBudgetingDefaults (defaultBuiltinCostModelForTesting)
+import PlutusCore.MkPlc (mkConstant)
+import PlutusPrelude (Word8, def)
+import Test.Tasty.HUnit (assertEqual, assertFailure)
+import UntypedPlutusCore qualified as UPLC
+
+-- | Given a lower and upper bound (both inclusive) on length, generate a 'ByteString' whose length
+-- falls within these bounds. Furthermore, the generated 'ByteString' will show as a list of
+-- hex-encoded bytes on a failure, instead of the default 'Show' output.
+--
+-- = Note
+--
+-- It is the caller's responsibility to ensure that the bounds are sensible: that is, that neither
+-- the upper nor the lower bound is negative, and that the lower bound is not greater than the
+-- upper bound.
+forAllByteString :: forall (m :: Type -> Type) .
+  (Monad m, HasCallStack) =>
+  Int -> Int -> PropertyT m ByteString
+forAllByteString lo = forAllWith hexShow . Gen.bytes . Range.linear lo
+
+-- | As 'forAllByteString', but with a postcondition.
+--
+-- = Note
+--
+-- If the postcondition is unlikely, the generator may eventually fail after too many retries.
+-- Ensure that the postcondition is likely to hold, to avoid this problem.
+forAllByteStringThat :: forall (m :: Type -> Type) .
+  (Monad m, HasCallStack) =>
+  (ByteString -> Bool) -> Int -> Int -> PropertyT m ByteString
+forAllByteStringThat p lo = forAllWith hexShow . Gen.filterT p . Gen.bytes . Range.linear lo
+
+-- | Typechecks and evaluates both PLC expressions. If either of them fails to typecheck, fail the
+-- test, noting what the failure was. If both typecheck, but either errors when run, fail the test,
+-- noting the log(s) for any failing expression. If both run without error, compare the results
+-- using '==='.
+evaluateTheSame ::
+  HasCallStack =>
+  PLC.Term UPLC.TyName UPLC.Name UPLC.DefaultUni UPLC.DefaultFun () ->
+  PLC.Term UPLC.TyName UPLC.Name UPLC.DefaultUni UPLC.DefaultFun () ->
+  PropertyT IO ()
+evaluateTheSame lhs rhs =
+  case typecheckEvaluateCek def defaultBuiltinCostModelForTesting lhs of
+    Left x -> annotateShow x >> failure
+    Right (resLhs, logsLhs) -> case typecheckEvaluateCek def defaultBuiltinCostModelForTesting rhs of
+      Left x -> annotateShow x >> failure
+      Right (resRhs, logsRhs) -> case (resLhs, resRhs) of
+        (PLC.EvaluationFailure, PLC.EvaluationFailure) -> do
+          annotateShow logsLhs
+          annotateShow logsRhs
+          failure
+        (PLC.EvaluationSuccess rLhs, PLC.EvaluationSuccess rRhs) -> rLhs === rRhs
+        (PLC.EvaluationFailure, _) -> annotateShow logsLhs >> failure
+        (_, PLC.EvaluationFailure) -> annotateShow logsRhs >> failure
+
+-- | As 'evaluateTheSame', but for cases where we want to compare a more complex computation to a
+-- constant (as if by @mkConstant@). This is slightly more efficient.
+evaluatesToConstant :: forall (a :: Type) .
+  PLC.Contains UPLC.DefaultUni a =>
+  a ->
+  PLC.Term UPLC.TyName UPLC.Name UPLC.DefaultUni UPLC.DefaultFun () ->
+  PropertyT IO ()
+evaluatesToConstant k expr =
+  case typecheckEvaluateCek def defaultBuiltinCostModelForTesting expr of
+    Left err -> annotateShow err >> failure
+    Right (res, logs) -> case res of
+      PLC.EvaluationFailure -> annotateShow logs >> failure
+      PLC.EvaluationSuccess r -> r === mkConstant () k
+
+-- | Given a PLC expression and an intended type (via a type argument), typecheck the expression,
+-- evaluate it, then produce the required Haskell value from the results. If we fail at any stage,
+-- instead fail the test and report the failure.
+evaluateToHaskell :: forall (a :: Type) .
+  ReadKnownIn UPLC.DefaultUni (UPLC.Term UPLC.Name UPLC.DefaultUni UPLC.DefaultFun ()) a =>
+  PLC.Term UPLC.TyName UPLC.Name UPLC.DefaultUni UPLC.DefaultFun () ->
+  PropertyT IO a
+evaluateToHaskell expr =
+  case typecheckReadKnownCek def defaultBuiltinCostModelForTesting expr of
+    Left err -> annotateShow err >> failure
+    Right (Left err) -> annotateShow err >> failure
+    Right (Right x) -> pure x
+
+-- | As 'evaluatesToConstant', but for a unit instead of a property.
+assertEvaluatesToConstant :: forall (a :: Type) .
+  PLC.Contains UPLC.DefaultUni a =>
+  a ->
+  PLC.Term UPLC.TyName UPLC.Name UPLC.DefaultUni UPLC.DefaultFun () ->
+  IO ()
+assertEvaluatesToConstant k expr =
+  case typecheckEvaluateCek def defaultBuiltinCostModelForTesting expr of
+    Left err -> assertFailure . show $ err
+    Right (res, logs) -> case res of
+      PLC.EvaluationFailure -> assertFailure . show $ logs
+      PLC.EvaluationSuccess r -> assertEqual "" r (mkConstant () k)
+
+-- Helpers
+
+hexShow :: ByteString -> String
+hexShow bs = "[" <> (go . BS.unpack $ bs) <> "]"
+  where
+    go :: [Word8] -> String
+    go = \case
+      [] -> ""
+      [w8] -> byteToHex w8
+      (w8 : w8s) -> byteToHex w8 <> ", " <> go w8s
+
+byteToHex :: Word8 -> String
+byteToHex w8
+  | w8 < 16 = "0x0" <> showHex w8 "" -- pad single-digit bytes to a full two hex digits
+  | otherwise = "0x" <> showHex w8 ""
diff --git a/plutus-ledger-api/src/PlutusLedgerApi/Common/Versions.hs b/plutus-ledger-api/src/PlutusLedgerApi/Common/Versions.hs
index 4a68bb38c4c..bdec21ac37d 100644
--- a/plutus-ledger-api/src/PlutusLedgerApi/Common/Versions.hs
+++ b/plutus-ledger-api/src/PlutusLedgerApi/Common/Versions.hs
@@ -121,7 +121,8 @@ builtinsIntroducedIn = Map.fromList [
   ]),
   ((PlutusV3, futurePV), Set.fromList [
     AndByteString, OrByteString, XorByteString, ComplementByteString,
-    ReadBit, WriteBits, ReplicateByteString
+    ReadBit, WriteBits, ReplicateByte,
+    ShiftByteString, RotateByteString, CountSetBits, FindFirstSetBit
   ])
   ]
 
diff --git a/plutus-metatheory/src/Builtin.lagda.md b/plutus-metatheory/src/Builtin.lagda.md
index f216d83f945..ac170bb5e2c 100644
--- a/plutus-metatheory/src/Builtin.lagda.md
+++ b/plutus-metatheory/src/Builtin.lagda.md
@@ -535,7 +535,7 @@ postulate
 {-# COMPILE GHC KECCAK-256 = Hash.keccak_256 #-}
 {-# COMPILE GHC BLAKE2B-224 = Hash.blake2b_224 #-}
 
-{-# FOREIGN GHC import PlutusCore.Bitwise.Convert qualified as Convert #-}
+{-# FOREIGN GHC import PlutusCore.Bitwise qualified as Convert #-}
 {-# COMPILE GHC BStoI = Convert.byteStringToIntegerWrapper #-}
 {-# COMPILE GHC ItoBS = \e w n -> builtinResultToMaybe $ Convert.integerToByteStringWrapper e w n #-}
 
diff --git a/plutus-tx-plugin/src/PlutusTx/Compiler/Builtins.hs b/plutus-tx-plugin/src/PlutusTx/Compiler/Builtins.hs
index c8741b870ee..297f3297d41 100644
--- a/plutus-tx-plugin/src/PlutusTx/Compiler/Builtins.hs
+++ b/plutus-tx-plugin/src/PlutusTx/Compiler/Builtins.hs
@@ -283,7 +283,12 @@ builtinNames = [
     , 'Builtins.complementByteString
     , 'Builtins.readBit
     , 'Builtins.writeBits
-    , 'Builtins.replicateByteString
+    , 'Builtins.replicateByte
+
+    , 'Builtins.shiftByteString
+    , 'Builtins.rotateByteString
+    , 'Builtins.countSetBits
+    , 'Builtins.findFirstSetBit
     ]
 
 defineBuiltinTerm :: CompilingDefault uni fun m ann => Ann -> TH.Name -> PIRTerm uni fun -> m ()
@@ -448,7 +453,13 @@ defineBuiltinTerms = do
       PLC.ComplementByteString -> defineBuiltinInl 'Builtins.complementByteString
       PLC.ReadBit -> defineBuiltinInl 'Builtins.readBit
       PLC.WriteBits -> defineBuiltinInl 'Builtins.writeBits
-      PLC.ReplicateByteString -> defineBuiltinInl 'Builtins.replicateByteString
+      PLC.ReplicateByte -> defineBuiltinInl 'Builtins.replicateByte
+
+      -- Other bitwise ops
+      PLC.ShiftByteString -> defineBuiltinInl 'Builtins.shiftByteString
+      PLC.RotateByteString -> defineBuiltinInl 'Builtins.rotateByteString
+      PLC.CountSetBits -> defineBuiltinInl 'Builtins.countSetBits
+      PLC.FindFirstSetBit -> defineBuiltinInl 'Builtins.findFirstSetBit
 
 defineBuiltinTypes
     :: CompilingDefault uni fun m ann
diff --git a/plutus-tx-plugin/test/Budget/9.6/map2.uplc.golden b/plutus-tx-plugin/test/Budget/9.6/map2.uplc.golden
index e2e0f98905e..f1bf99b0f21 100644
--- a/plutus-tx-plugin/test/Budget/9.6/map2.uplc.golden
+++ b/plutus-tx-plugin/test/Budget/9.6/map2.uplc.golden
@@ -212,8 +212,8 @@ program
                           [ (addInteger 7 n) , #534556454e ])
                        , (constr 0 []) ]) ]) ]) ])))
-          (addInteger 3 n))
-         (addInteger 4 n))
+          (addInteger 4 n))
+         (addInteger 3 n))
       (\`$dToData` `$dToData` ->
          (\go eta -> goList (go eta))
            (fix1
diff --git a/plutus-tx-plugin/test/Budget/9.6/map3.uplc.golden b/plutus-tx-plugin/test/Budget/9.6/map3.uplc.golden
index e2e0f98905e..f1bf99b0f21 100644
--- a/plutus-tx-plugin/test/Budget/9.6/map3.uplc.golden
+++ b/plutus-tx-plugin/test/Budget/9.6/map3.uplc.golden
@@ -212,8 +212,8 @@ program
                           [ (addInteger 7 n) , #534556454e ])
                        , (constr 0 []) ]) ]) ]) ])))
-          (addInteger 3 n))
-         (addInteger 4 n))
+          (addInteger 4 n))
+         (addInteger 3 n))
       (\`$dToData` `$dToData` ->
          (\go eta -> goList (go eta))
            (fix1
diff --git a/plutus-tx/changelog.d/20240523_124052_koz.ross_bitwise_2.md b/plutus-tx/changelog.d/20240523_124052_koz.ross_bitwise_2.md
new file mode 100644
index 00000000000..72fea979f9c
--- /dev/null
+++ b/plutus-tx/changelog.d/20240523_124052_koz.ross_bitwise_2.md
@@ -0,0 +1,39 @@
+
+
+
+### Added
+
+- Builtin wrappers for operations from [this
+  CIP](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-XXXX/CIP-XXXX.md).
+
+### Changed
+
+- Rename `replicateByteString` to `replicateByte`
+
+
+
diff --git a/plutus-tx/src/PlutusTx/Builtins.hs b/plutus-tx/src/PlutusTx/Builtins.hs
index e242df14841..fdbe67fe750 100644
--- a/plutus-tx/src/PlutusTx/Builtins.hs
+++ b/plutus-tx/src/PlutusTx/Builtins.hs
@@ -117,7 +117,12 @@ module PlutusTx.Builtins (
   , complementByteString
   , readBit
   , writeBits
-  , replicateByteString
+  , replicateByte
+  -- * Bitwise
+  , shiftByteString
+  , rotateByteString
+  , countSetBits
+  , findFirstSetBit
   ) where
 
 import Data.Maybe
@@ -633,7 +638,7 @@ byteOrderToBool BigEndian = True
 byteOrderToBool LittleEndian = False
 
 -- | Convert a 'BuiltinInteger' into a 'BuiltinByteString', as described in
--- [CIP-121](https://github.com/mlabs-haskell/CIPs/tree/koz/to-from-bytestring/CIP-0121).
+-- [CIP-121](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0121).
 -- The first argument indicates the endianness of the conversion and the third
 -- argument is the integer to be converted, which must be non-negative. The
 -- second argument must also be non-negative and it indicates the required width
@@ -651,7 +656,7 @@ integerToByteString :: ByteOrder -> Integer -> Integer -> BuiltinByteString
 integerToByteString endianness = BI.integerToByteString (toOpaque (byteOrderToBool endianness))
 
 -- | Convert a 'BuiltinByteString' to a 'BuiltinInteger', as described in
--- [CIP-121](https://github.com/mlabs-haskell/CIPs/tree/koz/to-from-bytestring/CIP-0121).
+-- [CIP-121](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0121).
 -- The first argument indicates the endianness of the conversion and the second
 -- is the bytestring to be converted. There is no limitation on the size of
 -- the bytestring. The empty bytestring is converted to the integer 0.
@@ -660,10 +665,35 @@ byteStringToInteger :: ByteOrder -> BuiltinByteString -> Integer
 byteStringToInteger endianness =
   BI.byteStringToInteger (toOpaque (byteOrderToBool endianness))
 
+-- Bitwise operations
+
+-- | Shift a 'BuiltinByteString', as per [CIP-123](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-0123/README.md).
+{-# INLINEABLE shiftByteString #-}
+shiftByteString :: BuiltinByteString -> Integer -> BuiltinByteString
+shiftByteString = BI.shiftByteString
+
+-- | Rotate a 'BuiltinByteString', as per [CIP-123](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-0123/README.md).
+{-# INLINEABLE rotateByteString #-}
+rotateByteString :: BuiltinByteString -> Integer -> BuiltinByteString
+rotateByteString = BI.rotateByteString
+
+-- | Count the set bits in a 'BuiltinByteString', as per [CIP-123](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-0123/README.md).
+{-# INLINEABLE countSetBits #-}
+countSetBits :: BuiltinByteString -> Integer
+countSetBits = BI.countSetBits
+
+-- | Find the lowest index of a set bit in a 'BuiltinByteString', as per [CIP-123](https://github.com/mlabs-haskell/CIPs/blob/koz/bitwise/CIP-0123/README.md).
+--
+-- If given a 'BuiltinByteString' which consists only of zero bytes (including the empty
+-- 'BuiltinByteString'), this returns @-1@.
+{-# INLINEABLE findFirstSetBit #-}
+findFirstSetBit :: BuiltinByteString -> Integer
+findFirstSetBit = BI.findFirstSetBit
+
 -- Logical operations
 
--- | Perform logical AND on two 'BuiltinByteString' arguments, as described
--- [here](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#bitwiselogicaland).
+-- | Perform logical AND on two 'BuiltinByteString' arguments, as described in
+-- [CIP-122](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#bitwiselogicaland).
 --
 -- The first argument indicates whether padding semantics should be used or not;
 -- if 'False', truncation semantics will be used instead.
@@ -671,9 +701,9 @@ byteStringToInteger endianness = -- = See also -- -- * [Padding and truncation --- semantics](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#padding-versus-truncation-semantics) +-- semantics](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#padding-versus-truncation-semantics) -- * [Bit indexing --- scheme](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#bit-indexing-scheme) +-- scheme](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#bit-indexing-scheme) {-# INLINEABLE andByteString #-} andByteString :: Bool -> @@ -683,7 +713,7 @@ andByteString :: andByteString b = BI.andByteString (toOpaque b) -- | Perform logical OR on two 'BuiltinByteString' arguments, as described --- [here](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#bitwiselogicalor). +-- [here](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#bitwiselogicalor). -- -- The first argument indicates whether padding semantics should be used or not; -- if 'False', truncation semantics will be used instead. @@ -691,9 +721,9 @@ andByteString b = BI.andByteString (toOpaque b) -- = See also -- -- * [Padding and truncation --- semantics](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#padding-versus-truncation-semantics) +-- semantics](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#padding-versus-truncation-semantics) -- * [Bit indexing --- scheme](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#bit-indexing-scheme) +-- scheme](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#bit-indexing-scheme) {-# INLINEABLE orByteString #-} orByteString :: Bool -> @@ -703,7 +733,7 @@ orByteString :: orByteString b = BI.orByteString (toOpaque b) -- | Perform logical XOR on two 'BuiltinByteString' arguments, as described --- [here](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#bitwiselogicalxor). +-- [here](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#bitwiselogicalxor). -- -- The first argument indicates whether padding semantics should be used or not; -- if 'False', truncation semantics will be used instead. @@ -711,9 +741,9 @@ orByteString b = BI.orByteString (toOpaque b) -- = See also -- -- * [Padding and truncation --- semantics](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#padding-versus-truncation-semantics) +-- semantics](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#padding-versus-truncation-semantics) -- * [Bit indexing --- scheme](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#bit-indexing-scheme) +-- scheme](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#bit-indexing-scheme) {-# INLINEABLE xorByteString #-} xorByteString :: Bool -> @@ -723,12 +753,12 @@ xorByteString :: xorByteString b = BI.xorByteString (toOpaque b) -- | Perform logical complement on a 'BuiltinByteString', as described --- [here](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#bitwiselogicalcomplement). +-- [here](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#bitwiselogicalcomplement). 
-- -- = See also -- -- * [Bit indexing --- scheme](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#bit-indexing-scheme) +-- scheme](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#bit-indexing-scheme) {-# INLINEABLE complementByteString #-} complementByteString :: BuiltinByteString -> @@ -744,9 +774,9 @@ complementByteString = BI.complementByteString -- = See also -- -- * [Bit indexing --- scheme](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#bit-indexing-scheme) +-- scheme](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#bit-indexing-scheme) -- * [Operation --- description](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#readbit) +-- description](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#readbit) {-# INLINEABLE readBit #-} readBit :: BuiltinByteString -> @@ -763,9 +793,9 @@ readBit bs i = fromOpaque (BI.readBit bs i) -- = See also -- -- * [Bit indexing --- scheme](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#bit-indexing-scheme) +-- scheme](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#bit-indexing-scheme) -- * [Operation --- description](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#writebits) +-- description](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#writebits) {-# INLINEABLE writeBits #-} writeBits :: BuiltinByteString -> @@ -780,10 +810,10 @@ writeBits = BI.writeBits -- = See also -- -- * [Operation --- description](https://github.com/mlabs-haskell/CIPs/blob/koz/logic-ops/CIP-0122/CIP-0122.md#replicateByteString) -{-# INLINEABLE replicateByteString #-} -replicateByteString :: +-- description](https://github.com/cardano-foundation/CIPs/tree/master/CIP-0122#replicateByteString) +{-# INLINEABLE replicateByte #-} +replicateByte :: Integer -> Integer -> BuiltinByteString -replicateByteString = BI.replicateByteString +replicateByte = BI.replicateByte diff --git a/plutus-tx/src/PlutusTx/Builtins/Internal.hs b/plutus-tx/src/PlutusTx/Builtins/Internal.hs index 38da315b54c..37844dd4f57 100644 --- a/plutus-tx/src/PlutusTx/Builtins/Internal.hs +++ b/plutus-tx/src/PlutusTx/Builtins/Internal.hs @@ -32,8 +32,7 @@ import Data.Kind (Type) import Data.Text as Text (Text, empty) import Data.Text.Encoding as Text (decodeUtf8, encodeUtf8) import GHC.Generics (Generic) -import PlutusCore.Bitwise.Convert qualified as Convert -import PlutusCore.Bitwise.Logical qualified as Logical +import PlutusCore.Bitwise qualified as Bitwise import PlutusCore.Builtin (BuiltinResult (..)) import PlutusCore.Crypto.BLS12_381.G1 qualified as BLS12_381.G1 import PlutusCore.Crypto.BLS12_381.G2 qualified as BLS12_381.G2 @@ -694,7 +693,7 @@ integerToByteString -> BuiltinInteger -> BuiltinByteString integerToByteString (BuiltinBool endiannessArg) paddingArg input = - case Convert.integerToByteStringWrapper endiannessArg paddingArg input of + case Bitwise.integerToByteStringWrapper endiannessArg paddingArg input of BuiltinSuccess bs -> BuiltinByteString bs BuiltinSuccessWithLogs logs bs -> traceAll logs $ BuiltinByteString bs BuiltinFailure logs err -> traceAll (logs <> pure (display err)) $ @@ -706,7 +705,40 @@ byteStringToInteger -> BuiltinByteString -> BuiltinInteger byteStringToInteger (BuiltinBool statedEndianness) (BuiltinByteString input) = - Convert.byteStringToIntegerWrapper statedEndianness input + Bitwise.byteStringToIntegerWrapper statedEndianness input + +{- +BITWISE +-} + 
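+-- Informally, a few identities the operations wrapped below satisfy (these are exercised by the
+-- property tests in Evaluation.Builtins.Bitwise):
+--
+--   * @shiftByteString bs n@ is all-zero whenever the magnitude of @n@ is at least the bit
+--     length of @bs@
+--   * @rotateByteString bs n@ is unchanged by adding any multiple of the bit length of @bs@
+--     to @n@
+--   * @countSetBits@ maps concatenation to addition, and @countSetBits empty@ is @0@
+--   * @findFirstSetBit bs@ is @-1@ exactly when @bs@ has no set bits
+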
+{-# NOINLINE shiftByteString #-} +shiftByteString :: + BuiltinByteString -> + BuiltinInteger -> + BuiltinByteString +shiftByteString (BuiltinByteString bs) = + BuiltinByteString . Bitwise.shiftByteString bs . fromIntegral + +{-# NOINLINE rotateByteString #-} +rotateByteString :: + BuiltinByteString -> + BuiltinInteger -> + BuiltinByteString +rotateByteString (BuiltinByteString bs) = + BuiltinByteString . Bitwise.rotateByteString bs . fromIntegral + +{-# NOINLINE countSetBits #-} +countSetBits :: + BuiltinByteString -> + BuiltinInteger +countSetBits (BuiltinByteString bs) = fromIntegral . Bitwise.countSetBits $ bs + +{-# NOINLINE findFirstSetBit #-} +findFirstSetBit :: + BuiltinByteString -> + BuiltinInteger +findFirstSetBit (BuiltinByteString bs) = + fromIntegral . Bitwise.findFirstSetBit $ bs {- LOGICAL @@ -719,7 +751,7 @@ andByteString :: BuiltinByteString -> BuiltinByteString andByteString (BuiltinBool isPaddingSemantics) (BuiltinByteString data1) (BuiltinByteString data2) = - BuiltinByteString . Logical.andByteString isPaddingSemantics data1 $ data2 + BuiltinByteString . Bitwise.andByteString isPaddingSemantics data1 $ data2 {-# NOINLINE orByteString #-} orByteString :: @@ -728,7 +760,7 @@ orByteString :: BuiltinByteString -> BuiltinByteString orByteString (BuiltinBool isPaddingSemantics) (BuiltinByteString data1) (BuiltinByteString data2) = - BuiltinByteString . Logical.orByteString isPaddingSemantics data1 $ data2 + BuiltinByteString . Bitwise.orByteString isPaddingSemantics data1 $ data2 {-# NOINLINE xorByteString #-} xorByteString :: @@ -737,14 +769,14 @@ xorByteString :: BuiltinByteString -> BuiltinByteString xorByteString (BuiltinBool isPaddingSemantics) (BuiltinByteString data1) (BuiltinByteString data2) = - BuiltinByteString . Logical.xorByteString isPaddingSemantics data1 $ data2 + BuiltinByteString . Bitwise.xorByteString isPaddingSemantics data1 $ data2 {-# NOINLINE complementByteString #-} complementByteString :: BuiltinByteString -> BuiltinByteString complementByteString (BuiltinByteString bs) = - BuiltinByteString . Logical.complementByteString $ bs + BuiltinByteString . Bitwise.complementByteString $ bs {-# NOINLINE readBit #-} readBit :: @@ -752,7 +784,7 @@ readBit :: BuiltinInteger -> BuiltinBool readBit (BuiltinByteString bs) i = - case Logical.readBit bs (fromIntegral i) of + case Bitwise.readBit bs (fromIntegral i) of BuiltinFailure logs err -> traceAll (logs <> pure (display err)) $ Haskell.error "readBit errored." BuiltinSuccess b -> BuiltinBool b @@ -765,19 +797,19 @@ writeBits :: BuiltinByteString writeBits (BuiltinByteString bs) (BuiltinList xs) = let unwrapped = fmap (\(BuiltinPair (i, BuiltinBool b)) -> (i, b)) xs in - case Logical.writeBits bs unwrapped of + case Bitwise.writeBits bs unwrapped of BuiltinFailure logs err -> traceAll (logs <> pure (display err)) $ Haskell.error "writeBits errored." BuiltinSuccess bs' -> BuiltinByteString bs' BuiltinSuccessWithLogs logs bs' -> traceAll logs $ BuiltinByteString bs' -{-# NOINLINE replicateByteString #-} -replicateByteString :: +{-# NOINLINE replicateByte #-} +replicateByte :: BuiltinInteger -> BuiltinInteger -> BuiltinByteString -replicateByteString n w8 = - case Logical.replicateByteString (fromIntegral n) (fromIntegral w8) of +replicateByte n w8 = + case Bitwise.replicateByte (fromIntegral n) (fromIntegral w8) of BuiltinFailure logs err -> traceAll (logs <> pure (display err)) $ Haskell.error "byteStringReplicate errored." BuiltinSuccess bs -> BuiltinByteString bs
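
For orientation, the following is a small, self-contained reference model of the bit-level semantics that the properties above pin down. It is a sketch only, not the implementation in PlutusCore.Bitwise: the module name and all helper names (BitwiseModel, readBitModel, packBits, shiftModel, rotateModel, countSetBitsModel, findFirstSetBitModel) are invented for illustration, and the behaviour is read off the CIP-122 bit indexing scheme and the tests in Evaluation.Builtins.Bitwise.

{-# LANGUAGE ImportQualifiedPost #-}

module BitwiseModel where

import Data.Bits (popCount, setBit, testBit)
import Data.ByteString (ByteString)
import Data.ByteString qualified as BS
import Data.Word (Word8)

-- CIP-122 bit indexing: bit 0 is the least significant bit of the *last* byte.
readBitModel :: ByteString -> Int -> Bool
readBitModel bs i =
  testBit (BS.index bs (BS.length bs - 1 - (i `div` 8))) (i `mod` 8)

-- Build a bytestring of the given byte length from a bit-index predicate.
packBits :: Int -> (Int -> Bool) -> ByteString
packBits len f = BS.pack [byteAt ix | ix <- [0 .. len - 1]]
  where
    byteAt :: Int -> Word8
    byteAt ix = foldl
      (\acc b -> if f (8 * (len - 1 - ix) + b) then setBit acc b else acc)
      0
      [0 .. 7]

-- Bit i of the result is bit (i - n) of the input, or clear when out of range;
-- hence positive shifts clear low indices and negative shifts clear high ones,
-- and shifting by at least the bit length clears everything.
shiftModel :: ByteString -> Int -> ByteString
shiftModel bs n =
  packBits len (\i -> let j = i - n in j >= 0 && j < 8 * len && readBitModel bs j)
  where
    len = BS.length bs

-- As shiftModel, but source indices wrap modulo the bit length, so rotations
-- "roll over" instead of clearing. Haskell's `mod` keeps the index non-negative.
rotateModel :: ByteString -> Int -> ByteString
rotateModel bs n
  | len == 0 = bs
  | otherwise = packBits len (\i -> readBitModel bs ((i - n) `mod` (8 * len)))
  where
    len = BS.length bs

-- Total number of set bits; a monoid homomorphism from concatenation to addition.
countSetBitsModel :: ByteString -> Int
countSetBitsModel = BS.foldl' (\acc w8 -> acc + popCount w8) 0

-- Lowest set bit index, or -1 when every byte is zero (including the empty input).
findFirstSetBitModel :: ByteString -> Int
findFirstSetBitModel bs =
  case filter (readBitModel bs) [0 .. 8 * BS.length bs - 1] of
    []      -> -1
    (i : _) -> i

Under this model, the zero-shift and zero-rotation identities, the composition (homomorphism) laws, the clearing and rollover behaviours, and the counting and find-first-set properties tested above all hold by construction.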