diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td b/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td
index 1200912148..bbb0c05865 100644
--- a/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td
@@ -69,6 +69,8 @@ def MappingPass : Pass<"place-and-route", "mlir::ModuleOp"> {
                         "the forwards and backwards mechanism.">,
                  Option<"seed", "seed", "std::size_t", "42",
                         "A seed used for randomization.">];
+  let statistics = [Statistic<"numSwaps", "num-inserted-swaps",
+                              "The number of inserted SWAPs">];
 }
 
 #endif // MLIR_DIALECT_QCO_TRANSFORMS_PASSES_TD
diff --git a/mlir/lib/Dialect/QCO/Transforms/Mapping/Mapping.cpp b/mlir/lib/Dialect/QCO/Transforms/Mapping/Mapping.cpp
index dcb59807d3..ef861b88b0 100644
--- a/mlir/lib/Dialect/QCO/Transforms/Mapping/Mapping.cpp
+++ b/mlir/lib/Dialect/QCO/Transforms/Mapping/Mapping.cpp
@@ -18,8 +18,9 @@
 #include <llvm/ADT/STLExtras.h>
 #include <llvm/ADT/SmallVector.h>
 #include <llvm/ADT/TypeSwitch.h>
-#include <llvm/Support/Debug.h>
+#include <llvm/Support/Allocator.h>
 #include <llvm/Support/ErrorHandling.h>
+#include <mlir/Analysis/TopologicalSortUtils.h>
 #include <mlir/Dialect/Func/IR/FuncOps.h>
 #include <mlir/IR/Block.h>
 #include <mlir/IR/BuiltinOps.h>
@@ -38,6 +39,7 @@
 #include <cstdint>
 #include <functional>
 #include <iterator>
+#include <memory>
 #include <numeric>
 #include <optional>
 #include <queue>
@@ -70,6 +72,8 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
    */
   enum class Direction : std::uint8_t { Forward, Backward };
 
+  struct LayoutInfo;
+
   /**
    * @brief A qubit layout that maps program and hardware indices without
    * storing Values. Used for efficient memory usage when Value tracking isn't
@@ -191,18 +195,6 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
       return programToHardware_.size();
     }
 
-    void dump() {
-      llvm::dbgs() << "prog= ";
-      for (std::size_t i = 0; i < nqubits(); ++i) {
-        llvm::dbgs() << i << " ";
-      }
-      llvm::dbgs() << "\nhw=   ";
-      for (std::size_t i = 0; i < nqubits(); ++i) {
-        llvm::dbgs() << programToHardware_[i] << ' ';
-      }
-      llvm::dbgs() << '\n';
-    }
-
   protected:
     /**
      * @brief Maps a program qubit index to its hardware index.
@@ -215,10 +207,43 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
     SmallVector<IndexType> hardwareToProgram_;
 
   private:
+    friend struct MappingPass::LayoutInfo;
+
+    Layout() = default;
     explicit Layout(const std::size_t nqubits)
         : programToHardware_(nqubits), hardwareToProgram_(nqubits) {}
   };
 
+  /**
+   * @brief Required to use Layout as a key for LLVM maps and sets.
+   * @details Hashing and equality only inspect programToHardware_.
+   */
+  struct LayoutInfo {
+    using Info = DenseMapInfo<SmallVector<IndexType>>;
+
+    static Layout getEmptyKey() {
+      Layout l;
+      l.programToHardware_ = Info::getEmptyKey();
+      l.hardwareToProgram_ = Info::getEmptyKey();
+      return l;
+    }
+
+    static Layout getTombstoneKey() {
+      Layout l;
+      l.programToHardware_ = Info::getTombstoneKey();
+      l.hardwareToProgram_ = Info::getTombstoneKey();
+      return l;
+    }
+
+    static unsigned getHashValue(const Layout& l) {
+      return Info::getHashValue(l.programToHardware_);
+    }
+
+    static bool isEqual(const Layout& a, const Layout& b) {
+      return Info::isEqual(a.programToHardware_, b.programToHardware_);
+    }
+  };
+
   /**
    * @brief Parameters influencing the behavior of the A* search algorithm.
    */
@@ -240,30 +265,34 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
    * @brief Describes a node in the A* search graph.
    */
   struct Node {
-    SmallVector<IndexGate> sequence;
+    struct ComparePointer {
+      bool operator()(const Node* a, const Node* b) const {
+        return b->f < a->f;
+      }
+    };
+
     Layout layout;
+    IndexGate swap;
+    Node* parent;
+    std::size_t depth;
     float f;
 
     /**
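-     * @brief Construct a root node with the given layout. Initialize the
-     * sequence with an empty vector and set the cost to zero.
+     * @brief Construct a root node with the given layout. The root has no
+     * parent, a depth of zero, and a cost of zero.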
      */
-    explicit Node(Layout layout) : layout(std::move(layout)), f(0) {}
+    explicit Node(Layout layout)
+        : layout(std::move(layout)), parent(nullptr), depth(0), f(0) {}
 
     /**
      * @brief Construct a non-root node from its parent node. Apply the given
-     * swap to the layout of the parent node and evaluate the cost.
+     * swap to the layout of the parent node and compute the cost f = g + h.
      */
-    Node(const Node& parent, IndexGate swap, ArrayRef<Layer> layers,
+    Node(Node* parent, IndexGate swap, ArrayRef<Layer> layers,
          const Architecture& arch, const Parameters& params)
-        : sequence(parent.sequence), layout(parent.layout), f(0) {
-      // Apply node-specific swap to given layout.
+        : layout(parent->layout), swap(swap), parent(parent),
+          depth(parent->depth + 1), f(0) {
       layout.swap(swap.first, swap.second);
-
-      // Add swap to sequence.
-      sequence.emplace_back(swap);
-
-      // Evaluate cost function.
       f = g(params.alpha) + h(layers, arch, params); // NOLINT
     }
 
@@ -279,12 +308,6 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
       });
     }
 
-    /**
-     * @returns true iff. the costs of this node are higher than the one of @p
-     * rhs.
-     */
-    [[nodiscard]] bool operator>(const Node& rhs) const { return f > rhs.f; }
-
   private:
     /**
      * @brief Calculate the path cost for the A* search algorithm.
@@ -293,7 +316,7 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
      * SWAPs.
      */
     [[nodiscard]] float g(float alpha) const {
-      return alpha * static_cast<float>(sequence.size());
+      return alpha * static_cast<float>(depth);
     }
 
     /**
@@ -319,8 +342,6 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
     }
   };
 
-  using MinQueue = std::priority_queue<Node, std::vector<Node>, std::greater<>>;
-
   struct [[nodiscard]] TrialResult {
     explicit TrialResult(Layout layout) : layout(std::move(layout)) {}
 
@@ -380,10 +401,9 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
     DenseMap<Operation*, std::size_t> refCount;
   };
 
-public:
+protected:
   using MappingPassBase::MappingPassBase;
 
-protected:
   void runOnOperation() override {
     std::mt19937_64 rng{this->seed};
     IRRewriter rewriter(&getContext());
@@ -555,9 +575,9 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
    * @brief Perform A* search to find a sequence of SWAPs that makes the
    * two-qubit operations inside the first layer (the front) executable.
    * @details
-   * The iteration budget is then b^{3}, which corresponds to
-   * exhausting all paths of length up to b^{2} in a search tree with branching
-   * factor b. A hard cap prevents impractical runtimes on larger architectures.
+   * The iteration budget is b^{3} visited nodes (skipped duplicates included),
+   * i.e. roughly a depth-3 search in a tree with branching factor b. A hard cap
+   * prevents impractical runtimes on larger architectures.
    *
    * The branching factor b of the A* search is the product of the
    * architecture's maximum qubit degree and the maximum number of two-qubit
@@ -575,43 +595,70 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
     const std::size_t b = arch.maxDegree() * ((arch.nqubits() + 1) / 2);
     const std::size_t budget = std::min(b * b * b, cap);
 
-    Node root(layout);
-    if (root.isGoal(layers.front(), arch)) {
+    llvm::SpecificBumpPtrAllocator<Node> arena;
+    std::priority_queue<Node*, std::vector<Node*>, Node::ComparePointer>
+        frontier;
+
+    Node* root = std::construct_at(arena.Allocate(), layout);
+    if (root->isGoal(layers.front(), arch)) {
       return SmallVector<IndexGate>{};
     }
-
-    MinQueue frontier{};
     frontier.emplace(root);
+
+    DenseMap<Layout, std::size_t, LayoutInfo> bestDepth;
     DenseSet<IndexGate> expansionSet;
 
     std::size_t i = 0;
     while (!frontier.empty() && i < budget) {
-      Node curr = frontier.top();
+      Node* curr = frontier.top();
       frontier.pop();
 
-      if (curr.isGoal(layers.front(), arch)) {
-        return curr.sequence;
+      // Multiple sequences of SWAPs can lead to the same layout and the same
+      // layout creates the same child-nodes. Thus, if we've seen a layout
+      // already at a lower depth don't reexpand the current node (and hence
+      // recreate the same child nodes).
+
+      const auto [it, inserted] =
+          bestDepth.try_emplace(curr->layout, curr->depth);
+      if (!inserted) {
+        const auto otherDepth = it->getSecond();
+        if (curr->depth >= otherDepth) {
+          ++i;
+          continue;
+        }
+
+        it->second = curr->depth;
       }
 
-      // Given a layout, create child-nodes for each possible SWAP between
-      // two neighbouring hardware qubits.
+      // If the currently visited node is a goal node, reconstruct the sequence
+      // of SWAPs from this node to the root.
 
-      expansionSet.clear();
-      if (!curr.sequence.empty()) {
-        expansionSet.insert(curr.sequence.back());
+      if (curr->isGoal(layers.front(), arch)) {
+        SmallVector<IndexGate> seq(curr->depth);
+        std::size_t j = seq.size() - 1;
+        for (Node* n = curr; n->parent != nullptr; n = n->parent) {
+          seq[j] = n->swap;
+          --j;
+        }
+        return seq;
       }
 
+      // Given a layout, create child-nodes for each possible SWAP
+      // between two neighbouring hardware qubits.
+
+      expansionSet.clear();
       for (const IndexGate& gate : layers.front()) {
         for (const auto prog : {gate.first, gate.second}) {
-          const auto hw0 = curr.layout.getHardwareIndex(prog);
+          const auto hw0 = curr->layout.getHardwareIndex(prog);
           for (const auto hw1 : arch.neighboursOf(hw0)) {
-            /// Ensure consistent hashing/comparison.
+            // Ensure consistent hashing/comparison.
             const IndexGate swap = std::minmax(hw0, hw1);
             if (!expansionSet.insert(swap).second) {
               continue;
             }
 
-            frontier.emplace(curr, swap, layers, arch, params);
+            frontier.emplace(std::construct_at(arena.Allocate(), curr, swap,
+                                               layers, arch, params));
           }
         }
       }
@@ -756,9 +803,8 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
    * @details Replace the dynamic with static qubits ("placement") and inserts
    * the SWAPs of the trial result into the IR.
    */
-  static void commitTrial(const TrialResult& result,
-                          ArrayRef<QubitValue> dynQubits, Region& funcBody,
-                          IRRewriter& rewriter) {
+  void commitTrial(const TrialResult& result, ArrayRef<QubitValue> dynQubits,
+                   Region& funcBody, IRRewriter& rewriter) {
     // Helper function that advances the iterator to the input qubit (the
     // operation producing it) of a deallocation or two-qubit op.
     const auto advFront = [](WireIterator& it) {
@@ -792,20 +838,9 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
 
       // Apply the sequence of SWAPs and rewire the qubit SSA values.
       for (const auto& [hw0, hw1] : swaps) {
-        Operation* op0 = wires[hw0].operation();
-        Operation* op1 = wires[hw1].operation();
         const auto in0 = wires[hw0].qubit();
         const auto in1 = wires[hw1].qubit();
 
-        // Reorder to avoid SSA dominance issues.
-        assert(op0->getBlock()->isOpOrderValid() &&
-               "An invalid op order leads to a significant runtime overhead.");
-        if (op0->isBeforeInBlock(op1)) {
-          rewriter.setInsertionPointAfterValue(in1);
-        } else {
-          rewriter.setInsertionPointAfterValue(in0);
-        }
-
         auto op = SWAPOp::create(rewriter, rewriter.getUnknownLoc(), in0, in1);
         const auto out0 = op.getQubit0Out();
         const auto out1 = op.getQubit1Out();
@@ -842,7 +877,10 @@ struct MappingPass : impl::MappingPassBase<MappingPass> {
       }
 
       ready.clear(); // Prepare for next iteration.
+      this->numSwaps += swaps.size();
     }
+
+    for_each(funcBody.getBlocks(), [](Block& b) { sortTopologically(&b); });
   }
 };
 
diff --git a/mlir/unittests/Dialect/QCO/Transforms/Mapping/test_mapping.cpp b/mlir/unittests/Dialect/QCO/Transforms/Mapping/test_mapping.cpp
index 1ab817660a..f46790f75a 100644
--- a/mlir/unittests/Dialect/QCO/Transforms/Mapping/test_mapping.cpp
+++ b/mlir/unittests/Dialect/QCO/Transforms/Mapping/test_mapping.cpp
@@ -111,7 +111,7 @@ class MappingPassTest : public testing::Test,
                                                               .alpha = 1,
                                                               .lambda = 0.85,
                                                               .niterations = 2,
-                                                              .ntrials = 8,
+                                                              .ntrials = 16,
                                                               .seed = 1337}));
     pm.addPass(createQCOToQC());
     auto res = pm.run(*moduleOp);