diff --git a/Makefile b/Makefile
index 27f98f21..39830516 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,8 @@ clean-deps:
 SW_HEX := sw/bin/helloworld.hex
 
+# DEFINES is quoted so a value holding several flags (e.g. -DBASELINE -DFOO) reaches the sub-make as one variable.
 $(SW_HEX): sw/*.c sw/*.h sw/*.S sw/*.ld
-	$(MAKE) -C sw/ compile
+	$(MAKE) -C sw/ compile DEFINES='$(DEFINES)'
 
 ## Build all top-level programs in sw/
 software: $(SW_HEX)
diff --git a/rtl/tb_croc_soc.sv b/rtl/tb_croc_soc.sv
index b87f6646..06500d87 100644
--- a/rtl/tb_croc_soc.sv
+++ b/rtl/tb_croc_soc.sv
@@ -438,6 +438,11 @@ module tb_croc_soc #(
     /////////////////
 
     logic [31:0] tb_data;
+    int unsigned cycle_count; // clk rising edges counted so far; 2-state type, so it starts at 0
+
+    always @(posedge clk) begin // free-running counter, no reset
+      cycle_count <= cycle_count + 1;
+    end // NOTE(review): the initial block below also assigns cycle_count (blocking); confirm that never coincides with a posedge clk, where this nonblocking update would win
 
     initial begin
         $timeformat(-9, 0, "ns", 12); // 1: scale (ns=-9), 2: decimals, 3: suffix, 4: print-field width
@@ -470,9 +475,11 @@ module tb_croc_soc #(
         // resume core
         jtag_resume();
 
+        cycle_count = 0;
         // wait for non-zero return value (written into core status register)
         $display("@%t | [CORE] Wait for end of code...", $time);
         jtag_wait_for_eoc(tb_data);
+        $display("Program cycles: %d", cycle_count);
 
         // finish simulation
         repeat(50) @(posedge clk);
diff --git a/sw/Makefile b/sw/Makefile
index f5b65cd9..52b04225 100644
--- a/sw/Makefile
+++ b/sw/Makefile
@@ -12,7 +12,7 @@ INCDIR  ?= lib/inc
 # Toolchain
 
 RISCV_XLEN    ?= 32
-RISCV_MARCH   ?= rv$(RISCV_XLEN)i_zicsr
+RISCV_MARCH   ?= rv$(RISCV_XLEN)im_zicsr
 RISCV_MABI    ?= ilp32
 RISCV_PREFIX  ?= riscv64-unknown-elf-
 RISCV_CC      ?= $(RISCV_PREFIX)gcc
@@ -25,7 +25,8 @@ RISCV_LD      ?= $(RISCV_PREFIX)ld
 RISCV_STRIP   ?= $(RISCV_PREFIX)strip
 
 RISCV_FLAGS    ?= -march=$(RISCV_MARCH) -mabi=$(RISCV_MABI) -mcmodel=medany -static -std=gnu99 -Os -nostdlib -fno-builtin -ffreestanding
-RISCV_CCFLAGS  ?= $(RISCV_FLAGS) -Iinclude -I$(INCDIR) -I$(CURDIR)
+DEFINES ?=
+RISCV_CCFLAGS  ?= $(RISCV_FLAGS) -Iinclude -I$(INCDIR) -I$(CURDIR) $(DEFINES)
 RISCV_LDFLAGS  ?= -static -nostartfiles -lm -lgcc -lc $(RISCV_FLAGS)
 
 # all
diff --git a/sw/dot_product.c b/sw/dot_product.c
new file mode 100644
index 00000000..37728bb1
--- /dev/null
+++ b/sw/dot_product.c
@@ -0,0 +1,47 @@
+#include "mac.h"
+#include <stdint.h>
+
+int32_t dot_product(const int32_t *x, const int32_t *y, int n) {
+  int32_t sum = 0; // running sum of x[k] * y[k] over k in [0, n)
+  for (int k = 0; k < n; ++k) {
+    MAC(sum, x[k], y[k]); // folds x[k] * y[k] into sum via the macro from mac.h
+  }
+  return sum;
+}
+static int32_t a[200] = {
+    1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,
+    16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,
+    31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,
+    46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,
+    61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
+    76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
+    91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104, 105,
+    106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+    121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
+    136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
+    151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
+    166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
+    181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
+    196, 197, 198, 199, 200};
+
+static int32_t b[200] = {
+    200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186,
+    185, 184, 183, 182, 181, 180, 179, 178, 177, 176, 175, 174, 173, 172, 171,
+    170, 169, 168, 167, 166, 165, 164, 163, 162, 161, 160, 159, 158, 157, 156,
+    155, 154, 153, 152, 151, 150, 149, 148, 147, 146, 145, 144, 143, 142, 141,
+    140, 139, 138, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128, 127, 126,
+    125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111,
+    110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99,  98,  97,  96,
+    95,  94,  93,  92,  91,  90,  89,  88,  87,  86,  85,  84,  83,  82,  81,
+    80,  79,  78,  77,  76,  75,  74,  73,  72,  71,  70,  69,  68,  67,  66,
+    65,  64,  63,  62,  61,  60,  59,  58,  57,  56,  55,  54,  53,  52,  51,
+    50,  49,  48,  47,  46,  45,  44,  43,  42,  41,  40,  39,  38,  37,  36,
+    35,  34,  33,  32,  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,
+    20,  19,  18,  17,  16,  15,  14,  13,  12,  11,  10,  9,   8,   7,   6,
+    5,   4,   3,   2,   1};
+
+int main(void) {
+  const int32_t want = 1353400; // sum of i * (201 - i) for i = 1..200
+  const int32_t got = dot_product(a, b, 200);
+  return (got == want) ? 1 : -1; // 1 on match, -1 on mismatch
+}
diff --git a/sw/lib/inc/mac.h b/sw/lib/inc/mac.h
index 1e84165e..00e0e1c2 100644
--- a/sw/lib/inc/mac.h
+++ b/sw/lib/inc/mac.h
@@ -6,8 +6,12 @@
 #define MAC_FUNCT3 0x0
 #define MAC_FUNCT7 0x9
 // R type for gnu assembler: opcode, func3, func7, rd, rs1, rs2
+#ifndef BASELINE // default build: MAC() emits the R-type .insn described above
 #define MAC(a, b, c)                                                           \
   asm volatile(".insn r %1, %2, %3, %0, %4, %5"                                \
                : "+&r"(a)                                                      \
                : "i"(MAC_OPCODE), "i"(MAC_FUNCT3), "i"(MAC_FUNCT7), "r"(b),    \
                  "r"(c))
+#else // -DBASELINE: MAC() is plain C, a += b * c, with no custom instruction
+#define MAC(a, b, c) ((a) += (b) * (c))
+#endif // BASELINE
diff --git a/sw/matrix_vector_mul.c b/sw/matrix_vector_mul.c
new file mode 100644
index 00000000..accdefce
--- /dev/null
+++ b/sw/matrix_vector_mul.c
@@ -0,0 +1,72 @@
+#include "mac.h"
+#include <stdint.h>
+
+// Returns x[0]*y[0] + ... + x[n-1]*y[n-1], folding in each product with MAC().
+int32_t dot_product(const int32_t *x, const int32_t *y, int n) {
+  int32_t total = 0;
+  for (int idx = 0; idx < n; ++idx) {
+    MAC(total, x[idx], y[idx]);
+  }
+  return total;
+}
+
+// 5x100 matrix, mat[r][c] = (r+1) * (c+1)
+static const int32_t mat[5][100] = {
+    {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
+     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
+     35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+     52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+     69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
+     86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100},
+    {2,   4,   6,   8,   10,  12,  14,  16,  18,  20,  22,  24,  26,  28,  30,
+     32,  34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,  58,  60,
+     62,  64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,  86,  88,  90,
+     92,  94,  96,  98,  100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120,
+     122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150,
+     152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180,
+     182, 184, 186, 188, 190, 192, 194, 196, 198, 200},
+    {3,   6,   9,   12,  15,  18,  21,  24,  27,  30,  33,  36,  39,  42,  45,
+     48,  51,  54,  57,  60,  63,  66,  69,  72,  75,  78,  81,  84,  87,  90,
+     93,  96,  99,  102, 105, 108, 111, 114, 117, 120, 123, 126, 129, 132, 135,
+     138, 141, 144, 147, 150, 153, 156, 159, 162, 165, 168, 171, 174, 177, 180,
+     183, 186, 189, 192, 195, 198, 201, 204, 207, 210, 213, 216, 219, 222, 225,
+     228, 231, 234, 237, 240, 243, 246, 249, 252, 255, 258, 261, 264, 267, 270,
+     273, 276, 279, 282, 285, 288, 291, 294, 297, 300},
+    {4,   8,   12,  16,  20,  24,  28,  32,  36,  40,  44,  48,  52,  56,  60,
+     64,  68,  72,  76,  80,  84,  88,  92,  96,  100, 104, 108, 112, 116, 120,
+     124, 128, 132, 136, 140, 144, 148, 152, 156, 160, 164, 168, 172, 176, 180,
+     184, 188, 192, 196, 200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240,
+     244, 248, 252, 256, 260, 264, 268, 272, 276, 280, 284, 288, 292, 296, 300,
+     304, 308, 312, 316, 320, 324, 328, 332, 336, 340, 344, 348, 352, 356, 360,
+     364, 368, 372, 376, 380, 384, 388, 392, 396, 400},
+    {5,   10,  15,  20,  25,  30,  35,  40,  45,  50,  55,  60,  65,  70,  75,
+     80,  85,  90,  95,  100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150,
+     155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215, 220, 225,
+     230, 235, 240, 245, 250, 255, 260, 265, 270, 275, 280, 285, 290, 295, 300,
+     305, 310, 315, 320, 325, 330, 335, 340, 345, 350, 355, 360, 365, 370, 375,
+     380, 385, 390, 395, 400, 405, 410, 415, 420, 425, 430, 435, 440, 445, 450,
+     455, 460, 465, 470, 475, 480, 485, 490, 495, 500}};
+
+// 100-element vector, vec[c] = c+1
+static const int32_t vec[100] = {
+    1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17,
+    18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
+    35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+    69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
+    86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100};
+
+// Matrix-vector product: out[row] is the dot product of row `row` of m with v.
+void matvec(const int32_t m[5][100], const int32_t v[100], int32_t out[5]) {
+  for (int row = 0; row != 5; ++row) {
+    out[row] = dot_product(&m[row][0], v, 100);
+  }
+}
+
+int main(void) {
+  int32_t out[5]; // row r of mat is (r + 1) * vec, so out[r] must be (r + 1) * 338350
+  matvec(mat, vec, out);
+  int all_ok = out[0] == 338350 && out[1] == 676700 && out[2] == 1015050 &&
+               out[3] == 1353400 && out[4] == 1691750; // every row, not only out[0]
+  return all_ok ? 1 : -1; // 1 when all five rows match, -1 otherwise
+}
diff --git a/sw/simple-mac.c b/sw/simple-mac.c
index 10deeda1..8c9868c8 100644
--- a/sw/simple-mac.c
+++ b/sw/simple-mac.c
@@ -1,14 +1,45 @@
 #include "mac.h"
 #include <stdint.h>
 
+#define MIN_INT INT32_MIN // not (1 << 31): shifting a 1 into the sign bit of int is undefined behaviour
+#define MAX_INT INT32_MAX // largest int32_t value, i.e. the bitwise complement of MIN_INT
+
 static void mac_baseline(int32_t *a, int32_t b, int32_t c) { *a += b * c; }
 
-int main() {
-  int32_t a_base = 9;
-  int32_t a_mac = a_base;
-  int32_t b = 3;
-  int32_t c = 5;
-  mac_baseline(&a_base, b, c);
+int failed_case = -1; // 1-based index of the first mismatching check; -1 while none has failed
+int test_case = 0;    // number of check_mac() calls made so far
+static int check_mac(int32_t a, int32_t b, int32_t c) { // returns 1 if MAC() agrees with mac_baseline(), else 0
+  int32_t a_mac = a; // MAC() works on its own copy of the accumulator
+  ++test_case;
+  mac_baseline(&a, b, c);
   MAC(a_mac, b, c);
-  return a_base == a_mac ? 1 : -1;
+  const int match = (a == a_mac);
+  if (!match && failed_case == -1) { failed_case = test_case; } // record only the first failure
+  return match; // a non-void function must not fall off its end
+}
+int main(void) {
+  // Each call runs one (acc, b, c) triple through both mac_baseline() and MAC().
+  // Standard positive numbers
+  check_mac(5, 10, 20);
+  // Zero multiplier
+  check_mac(100, 0, 50);
+  // Zero multiplicand
+  check_mac(10, 50, 0);
+  // Negative multiplicand
+  check_mac(7, 3, -4);
+  // Negative multiplier
+  check_mac(15, -2, 8);
+  // All zeros
+  check_mac(0, 0, 0);
+  // Large numbers
+  check_mac(100000, 20000, 3000);
+  // Negative acc
+  check_mac(-20, 2, 15);
+  // Largest positive operand
+  check_mac(0, MAX_INT, 1);
+  // Most negative operand
+  check_mac(0, 1, MIN_INT);
+
+  // 1 if every case matched (same convention as the other sw/ programs), else -N where N is the first failing case
+  return failed_case == -1 ? 1 : -failed_case;
 }