Discussion about this post

User's avatar
Kamaroyl's avatar

Here's what claude 4.1 gave me with the same prompt:

```

# Unsigned integer divide by 6 for RISC-V RV32I/E and RV64I

# Input: a0 = dividend

# Output: a0 = quotient (dividend / 6)

#

# Algorithm: Multiplication by reciprocal using magic number

# For division by 6, we use the multiplicative inverse approach

# Magic number for 32-bit: 0xAAAAAAAB (with shift)

# Magic number for 64-bit: 0xAAAAAAAAAAAAAAAB (with shift)

.text

.globl udiv6

.align 2

udiv6:

.if CPU_BITS == 64

# 64-bit implementation

# Algorithm: q = (n * 0xAAAAAAAAAAAAAAAB) >> 65

# Since we can't multiply, we decompose the multiplication

# into shifts and adds

# Save original value

mv t0, a0

# Compute n * 0xAAAAAAAAAAAAAAAB

# 0xAAAAAAAAAAAAAAAB in binary has pattern: 1010...1011

# This is (2^64 * 2 + 2^64 - 2^64/3) / 3

# We can compute n * 0xAAA...AAB using:

# result = n + (n << 1) + (n << 3) + (n << 4) + ... pattern

# More efficient method: n/6 = (n * 0xAAAAAAAAAAAAAAAB) >> 65

# Since we need >> 65, we actually compute the high 64 bits of

# a 128-bit multiplication

# Approximation: n/6 ≈ (n >> 1) - (n >> 3) + (n >> 5) - (n >> 7) + ...

# Better approach: n/6 = (n * 43) >> 8 for small n

# For full range: use the fact that n/6 = (n + (n>>2) + (n>>4) + ...) >> 3

# Exact algorithm without multiply:

# q = n >> 1

srli t1, a0, 1 # t1 = n >> 1

# q = q >> 1

srli t2, t1, 1 # t2 = n >> 2

# q = q + (n >> 2)

add t1, t1, t2 # t1 = (n >> 1) + (n >> 2)

# Continue the pattern to build up the reciprocal

srli t2, t2, 2 # t2 = n >> 4

add t1, t1, t2 # t1 += n >> 4

srli t2, t2, 2 # t2 = n >> 6

add t1, t1, t2 # t1 += n >> 6

srli t2, t2, 2 # t2 = n >> 8

add t1, t1, t2 # t1 += n >> 8

srli t2, t2, 2 # t2 = n >> 10

add t1, t1, t2 # t1 += n >> 10

srli t2, t2, 2 # t2 = n >> 12

add t1, t1, t2 # t1 += n >> 12

srli t2, t2, 2 # t2 = n >> 14

add t1, t1, t2 # t1 += n >> 14

srli t2, t2, 2 # t2 = n >> 16

add t1, t1, t2 # t1 += n >> 16

srli t2, t2, 2 # t2 = n >> 18

add t1, t1, t2 # t1 += n >> 18

srli t2, t2, 2 # t2 = n >> 20

add t1, t1, t2 # t1 += n >> 20

# Continue for more precision (64-bit needs more terms)

srli t2, t2, 2 # t2 = n >> 22

add t1, t1, t2

srli t2, t2, 2 # t2 = n >> 24

add t1, t1, t2

srli t2, t2, 2 # t2 = n >> 26

add t1, t1, t2

srli t2, t2, 2 # t2 = n >> 28

add t1, t1, t2

srli t2, t2, 2 # t2 = n >> 30

add t1, t1, t2

srli t2, t2, 2 # t2 = n >> 32

add t1, t1, t2

# More terms for 64-bit precision

srli t2, a0, 34

add t1, t1, t2

srli t2, a0, 36

add t1, t1, t2

srli t2, a0, 38

add t1, t1, t2

srli t2, a0, 40

add t1, t1, t2

srli t2, a0, 42

add t1, t1, t2

srli t2, a0, 44

add t1, t1, t2

srli t2, a0, 46

add t1, t1, t2

srli t2, a0, 48

add t1, t1, t2

srli t2, a0, 50

add t1, t1, t2

srli t2, a0, 52

add t1, t1, t2

srli t2, a0, 54

add t1, t1, t2

srli t2, a0, 56

add t1, t1, t2

srli t2, a0, 58

add t1, t1, t2

srli t2, a0, 60

add t1, t1, t2

srli t2, a0, 62

add t1, t1, t2

# Final adjustment shift

srli a0, t1, 2 # Final shift to get quotient

.else

# 32-bit implementation (also works for RV32E)

# Uses only registers a0-a5, t0-t2 which are available in RV32E

# Algorithm: q = (n * 0xAAAAAAAB) >> 33

# We compute this without multiplication

# The pattern 0xAAAAAAAB = 10101010...10101011

# n/6 can be computed as sum of shifts

# More efficient for 32-bit:

# n/6 ≈ (n >> 1) - (n >> 3) + (n >> 5) - ... (alternating series)

# But we use the positive series for simplicity

# Compute n * (2^32 + 2^32/3) / 2^33

# Which equals (n + n/3) / 2

# Step 1: Compute n/3 approximation using series

# n/3 ≈ (n >> 2) + (n >> 4) + (n >> 6) + (n >> 8) + ...

srli t0, a0, 2 # t0 = n >> 2

srli t1, a0, 4 # t1 = n >> 4

add t0, t0, t1 # t0 = (n >> 2) + (n >> 4)

srli t1, a0, 6 # t1 = n >> 6

add t0, t0, t1 # t0 += n >> 6

srli t1, a0, 8 # t1 = n >> 8

add t0, t0, t1 # t0 += n >> 8

srli t1, a0, 10 # t1 = n >> 10

add t0, t0, t1 # t0 += n >> 10

srli t1, a0, 12 # t1 = n >> 12

add t0, t0, t1 # t0 += n >> 12

srli t1, a0, 14 # t1 = n >> 14

add t0, t0, t1 # t0 += n >> 14

srli t1, a0, 16 # t1 = n >> 16

add t0, t0, t1 # t0 += n >> 16

srli t1, a0, 18 # t1 = n >> 18

add t0, t0, t1 # t0 += n >> 18

srli t1, a0, 20 # t1 = n >> 20

add t0, t0, t1 # t0 += n >> 20

srli t1, a0, 22 # t1 = n >> 22

add t0, t0, t1 # t0 += n >> 22

srli t1, a0, 24 # t1 = n >> 24

add t0, t0, t1 # t0 += n >> 24

srli t1, a0, 26 # t1 = n >> 26

add t0, t0, t1 # t0 += n >> 26

srli t1, a0, 28 # t1 = n >> 28

add t0, t0, t1 # t0 += n >> 28

srli t1, a0, 30 # t1 = n >> 30

add t0, t0, t1 # t0 += n >> 30

# Step 2: Now t0 ≈ n/3, compute (n + n/3) / 2

add t0, t0, a0 # t0 = n + n/3

srli a0, t0, 1 # a0 = (n + n/3) / 2 = n/6

.endif

ret

```

2 more comments...

No posts

Ready for more?