Skip to content

Commit

Permalink
riscv: __asm_copy_to-from_user: Optimize unaligned memory access and …
Browse files Browse the repository at this point in the history
…pipeline stall

This patch will reduce cpu usage dramatically in kernel space especially
for user apps which use sys-call with large buffer size, such as network
applications. The main reason behind this is that every unaligned memory
access will raise exceptions and switch between s-mode and m-mode causing
large overhead.

First copy in bytes until reaches the first word aligned boundary in
destination memory address. This is the preparation before the bulk
aligned word copy.

The destination address is aligned now, but oftentimes the source address
is not in an aligned boundary. To reduce the unaligned memory access, it
reads the data from source in aligned boundaries, which will cause the
data to have an offset, and then combines the data in the next iteration
by fixing offset with shifting before writing to destination. The majority
of the improving copy speed comes from this shift copy.

In the lucky situation that the both source and destination address are on
the aligned boundary, perform load and store with register size to copy the
data. Without the unrolling, it will reduce the speed since the next store
instruction for the same register using from the load will stall the
pipeline.

At last, copying the remainder in one byte at a time.

Signed-off-by: Akira Tsukamoto <akira.tsukamoto@gmail.com>
  • Loading branch information
mcd500 committed Jul 6, 2021
1 parent c8ea293 commit 179f8f1
Showing 1 changed file with 146 additions and 35 deletions.
181 changes: 146 additions & 35 deletions arch/riscv/lib/uaccess.S
Original file line number Diff line number Diff line change
Expand Up @@ -19,50 +19,161 @@ ENTRY(__asm_copy_from_user)
li t6, SR_SUM
csrs CSR_STATUS, t6

add a3, a1, a2
/* Use word-oriented copy only if low-order bits match */
andi t0, a0, SZREG-1
andi t1, a1, SZREG-1
bne t0, t1, 2f
/* Save for return value */
mv t5, a2

addi t0, a1, SZREG-1
andi t1, a3, ~(SZREG-1)
andi t0, t0, ~(SZREG-1)
/*
* a3: terminal address of source region
* t0: lowest XLEN-aligned address in source
* t1: highest XLEN-aligned address in source
* Register allocation for code below:
* a0 - start of uncopied dst
* a1 - start of uncopied src
* a2 - size
* t0 - end of uncopied dst
*/
bgeu t0, t1, 2f
bltu a1, t0, 4f
add t0, a0, a2
bgtu a0, t0, 5f

/*
* Use byte copy only if too small.
*/
li a3, 8*SZREG /* size must be larger than size in word_copy */
bltu a2, a3, .Lbyte_copy_tail

/*
* Copy first bytes until dst is align to word boundary.
* a0 - start of dst
* t1 - start of aligned dst
*/
addi t1, a0, SZREG-1
andi t1, t1, ~(SZREG-1)
/* dst is already aligned, skip */
beq a0, t1, .Lskip_first_bytes
1:
fixup REG_L, t2, (a1), 10f
fixup REG_S, t2, (a0), 10f
addi a1, a1, SZREG
addi a0, a0, SZREG
bltu a1, t1, 1b
/* a5 - one byte for copying data */
fixup lb a5, 0(a1), 10f
addi a1, a1, 1 /* src */
fixup sb a5, 0(a0), 10f
addi a0, a0, 1 /* dst */
bltu a0, t1, 1b /* t1 - start of aligned dst */

.Lskip_first_bytes:
/*
* Now dst is aligned.
* Use shift-copy if src is misaligned.
* Use word-copy if both src and dst are aligned because
* can not use shift-copy which do not require shifting
*/
/* a1 - start of src */
andi a3, a1, SZREG-1
bnez a3, .Lshift_copy

.Lword_copy:
/*
* Both src and dst are aligned, unrolled word copy
*
* a0 - start of aligned dst
* a1 - start of aligned src
* a3 - a1 & mask:(SZREG-1)
* t0 - end of aligned dst
*/
addi t0, t0, -(8*SZREG-1) /* not to over run */
2:
bltu a1, a3, 5f
fixup REG_L a4, 0(a1), 10f
fixup REG_L a5, SZREG(a1), 10f
fixup REG_L a6, 2*SZREG(a1), 10f
fixup REG_L a7, 3*SZREG(a1), 10f
fixup REG_L t1, 4*SZREG(a1), 10f
fixup REG_L t2, 5*SZREG(a1), 10f
fixup REG_L t3, 6*SZREG(a1), 10f
fixup REG_L t4, 7*SZREG(a1), 10f
fixup REG_S a4, 0(a0), 10f
fixup REG_S a5, SZREG(a0), 10f
fixup REG_S a6, 2*SZREG(a0), 10f
fixup REG_S a7, 3*SZREG(a0), 10f
fixup REG_S t1, 4*SZREG(a0), 10f
fixup REG_S t2, 5*SZREG(a0), 10f
fixup REG_S t3, 6*SZREG(a0), 10f
fixup REG_S t4, 7*SZREG(a0), 10f
addi a0, a0, 8*SZREG
addi a1, a1, 8*SZREG
bltu a0, t0, 2b

addi t0, t0, 8*SZREG-1 /* revert to original value */
j .Lbyte_copy_tail

.Lshift_copy:

/*
* Word copy with shifting.
* For misaligned copy we still perform aligned word copy, but
* we need to use the value fetched from the previous iteration and
* do some shifts.
* This is safe because reading less than a word size.
*
* a0 - start of aligned dst
* a1 - start of src
* a3 - a1 & mask:(SZREG-1)
* t0 - end of uncopied dst
* t1 - end of aligned dst
*/
/* calculating aligned word boundary for dst */
andi t1, t0, ~(SZREG-1)
/* Converting unaligned src to aligned arc */
andi a1, a1, ~(SZREG-1)

/*
* Calculate shifts
* t3 - prev shift
* t4 - current shift
*/
slli t3, a3, LGREG
li a5, SZREG*8
sub t4, a5, t3

/* Load the first word to combine with seceond word */
fixup REG_L a5, 0(a1), 10f

3:
/* Main shifting copy
*
* a0 - start of aligned dst
* a1 - start of aligned src
* t1 - end of aligned dst
*/

/* At least one iteration will be executed */
srl a4, a5, t3
fixup REG_L a5, SZREG(a1), 10f
addi a1, a1, SZREG
sll a2, a5, t4
or a2, a2, a4
fixup REG_S a2, 0(a0), 10f
addi a0, a0, SZREG
bltu a0, t1, 3b

/* Revert src to original unaligned value */
add a1, a1, a3

.Lbyte_copy_tail:
/*
* Byte copy anything left.
*
* a0 - start of remaining dst
* a1 - start of remaining src
* t0 - end of remaining dst
*/
bgeu a0, t0, 5f
4:
fixup lb a5, 0(a1), 10f
addi a1, a1, 1 /* src */
fixup sb a5, 0(a0), 10f
addi a0, a0, 1 /* dst */
bltu a0, t0, 4b /* t0 - end of dst */

5:
/* Disable access to user memory */
csrc CSR_STATUS, t6
li a0, 0
li a0, 0
ret
4: /* Edge case: unalignment */
fixup lbu, t2, (a1), 10f
fixup sb, t2, (a0), 10f
addi a1, a1, 1
addi a0, a0, 1
bltu a1, t0, 4b
j 1b
5: /* Edge case: remainder */
fixup lbu, t2, (a1), 10f
fixup sb, t2, (a0), 10f
addi a1, a1, 1
addi a0, a0, 1
bltu a1, a3, 5b
j 3b
ENDPROC(__asm_copy_to_user)
ENDPROC(__asm_copy_from_user)
EXPORT_SYMBOL(__asm_copy_to_user)
Expand Down Expand Up @@ -117,7 +228,7 @@ EXPORT_SYMBOL(__clear_user)
10:
/* Disable access to user memory */
csrs CSR_STATUS, t6
mv a0, a2
mv a0, t5
ret
11:
csrs CSR_STATUS, t6
Expand Down

0 comments on commit 179f8f1

Please sign in to comment.