Coding and general discussion relating to the compiler
Moderators: David Barker, Jerry Messina
-
Jerry Messina
- Swordfish Developer
- Posts: 1473
- Joined: Fri Jan 30, 2009 6:27 pm
- Location: US
Post
by Jerry Messina » Wed Sep 24, 2014 7:17 pm
The PIC18 has a built-in hardware multiplier that can be used to accelerate multiplication operations.
The 8x8 multiplier is used by SF when multipying two bytes together, but when you multiply anything
larger than that it uses a software algorithm, so it's much slower.
Here's a routine that takes advantage of the multiplier to do a 16x16 operation. As you can see from some of
the comments, it's about 10x as fast as the native multiply, even with the overhead of the function call
Code: Select all
//
// umul16x16() - perform unsigned 16x16 multiply using the
// PIC18 8x8 hardware multiplier and return the 32-bit result
// the method is taken from the examples found in the
// hardware multiplier section of the datasheet
//
function umul16x16(arg1 as word, arg2 as word) as longword
structure m32_t
r0 as byte
r1 as byte
r2 as byte
r3 as byte
w as word // extra temp space (also used by r24 type)
// aliases
w0 as r0.asword
w1 as r2.asword
dw as r0.aslongword
r24 as r1.aslongword
end structure
dim d as m32_t
d.w0 = arg1.bytes(0) * arg2.bytes(0) // arg1L * arg2L
d.w1 = arg1.bytes(1) * arg2.bytes(1) // arg1H * arg2H
d.w = arg1.bytes(0) * arg2.bytes(1) // arg1L * arg2H
d.r24 = d.r24 + d.w // add cross products
d.w = arg1.bytes(1) * arg2.bytes(0) // arg1H * arg2L
d.r24 = d.r24 + d.w // add cross products
result = d.dw
end function
// examples
Dim A As Word
Dim B As Word
Dim C1, C2 As LongWord
A = 2
B = 1
C1 = A * B // 65us @ 32MHz
C2 = umul16x16(A, B) // 8us @ 32MHz
A = 2048
B = 1024
C1 = A * B // 65us @ 32MHz
C2 = umul16x16(A, B) // 8us @ 32MHz
A = 65432
B = 65431
C1 = A * B // 81us @ 32MHz
C2 = umul16x16(A, B) // 8us @ 32MHz
A = 65535
B = 65535
C1 = A * B // 89us @ 32MHz
C2 = umul16x16(A, B) // 8us @ 32MHz
while(true)
end while
-
Jerry Messina
- Swordfish Developer
- Posts: 1473
- Joined: Fri Jan 30, 2009 6:27 pm
- Location: US
Post
by Jerry Messina » Thu Sep 25, 2014 7:06 pm
And here's a follow on to the previous post that has both 16-bit signed (smul16x16) and unsigned routines (umul16x16)
Like before, using the hardware multiplier is 8-10x faster
Code: Select all
//
// umul16x16() - perform unsigned 16x16 multiply using the
// PIC18 8x8 hardware multiplier and return the 32-bit result
// the method is taken from the examples found in the
// hardware multiplier section of the datasheet
//
function umul16x16(arg1 as word, arg2 as word) as longword
structure m32_t
r0 as byte
r1 as byte
r2 as byte
r3 as byte
w as word // extra temp space
// aliases
w0 as r0.asword
w1 as r2.asword
dw as r0.aslongword
r24 as r1.aslongword
end structure
dim d as m32_t
d.w0 = arg1.bytes(0) * arg2.bytes(0) // arg1L * arg2L
d.w1 = arg1.bytes(1) * arg2.bytes(1) // arg1H * arg2H
d.w = arg1.bytes(0) * arg2.bytes(1) // arg1L * arg2H
d.r24 = d.r24 + d.w // add cross products
d.w = arg1.bytes(1) * arg2.bytes(0) // arg1H * arg2L
d.r24 = d.r24 + d.w // add cross products
result = d.dw
end function
//
// smul16x16() - perform signed 16x16 multiply using the
// PIC18 8x8 hardware multiplier and return the 32-bit signed result
//
function smul16x16(arg1 as integer, arg2 as integer) as longint
structure m32_t
r0 as byte
r1 as byte
r2 as byte
r3 as byte
w as word // extra temp space
w0 as r0.asword
w1 as r2.asword
dw as r0.aslongword
li as r0.aslongint
r24 as r1.aslongword
end structure
dim d as m32_t
d.w0 = arg1.bytes(0) * arg2.bytes(0) // arg1L * arg2L
d.w1 = arg1.bytes(1) * arg2.bytes(1) // arg1H * arg2H
d.w = arg1.bytes(0) * arg2.bytes(1) // arg1L * arg2H
d.r24 = d.r24 + d.w // add cross products
d.w = arg1.bytes(1) * arg2.bytes(0) // arg1H * arg2L
d.r24 = d.r24 + d.w // add cross products
// check if arg2 is negative...
if (arg2.bits(15) = 1) then
d.w1 = d.w1 - word(arg1)
endif
// check if arg1 is negative...
if (arg1.bits(15) = 1) then
d.w1 = d.w1 - word(arg2)
endif
result = d.li
end function
//
// examples and test code
//
dim b1, b2 as byte // 8-bit unsigned
dim si1, si2 as shortint // 8-bit signed -128 to 127
dim w1, w2 as word // 16-bit unsigned
dim i1, i2 as integer // 16-bit signed -32768 to 32767
dim lw1, lw2 as longword // 32-bit unsigned
dim li1, li2 as longint // 32-bit signed -2147483648 to 2147483647
dim fail as boolean
dim t, t1, t2 as word // elap time
dim TMR0ON as T0CON.bits(7)
// check to make sure SF is using the hardware multiplier for 8-bit operations
// the functions above assume it does
// (look as the asm code for 'MULWF' instructions)
w1 = b1 * b2 // 8-bit unsigned
i1 = si1 * si2 // 8-bit signed
//
// unsigned examples
//
fail = false
w1 = 65535
w2 = 1
lw1 = w1 * w2 // sf method (~65us @ 32MHz)
lw2 = umul16x16(w1, w2) // fast method (8us @ 32MHz)
if (lw1 <> lw2) then // make sure results match
fail = true
endif
w1 = 65535
w2 = 65535
lw1 = w1 * w2 // sf method (~89us @ 32MHz)
lw2 = umul16x16(w1, w2) // fast method
if (lw1 <> lw2) then // make sure results match
fail = true
endif
//
// signed examples
//
fail = false
i1 = 32767
i2 = 32767
li1 = i1 * i2 // sf method
li2 = smul16x16(i1, i2) // fast method
if (li1 <> li2) then // make sure results match
fail = true
endif
i1 = 32767
i2 = -32768
li1 = i1 * i2
li2 = smul16x16(i1, i2)
if (li1 <> li2) then
fail = true
endif
i1 = -32768
i2 = 32767
li1 = i1 * i2
li2 = smul16x16(i1, i2)
if (li1 <> li2) then
fail = true
endif
i1 = -32768
i2 = -32768
li1 = i1 * i2
li2 = smul16x16(i1, i2)
if (li1 <> li2) then
fail = true
endif
{
//
// example code to count instruction cycles
// you can also use the simulator in MPLAB (which will give the time as well)
//
// set TMR0 to count instruction cycles
T0CON = %00001000
// just time the worst-case execution time (doesn't change much for hdw method)
w1 = 65535
w2 = 1
write_tmr(0)
TMR0ON = 1
lw1 = w1 * w2 // sf standard method
TMR0ON = 0
read_tmr(t1) // t1 contains number of instruction cycles
write_tmr(0)
TMR0ON = 1
lw2 = umul16x16(w1, w2) // fast method
TMR0ON = 0
read_tmr(t2) // t2 contains number of instruction cycles
t = 0
}
while(true)
end while