I think the smallest implementation in crypt is probably xor. So, I tried to investigate this part on a binary basis a little more.
TL;DR
--Since there is no xor.h for risc -v, I'm referring to a generic implementation.
――But is slow because it's a very simple implementation! </ del> It will be slower depending on the compiler! gcc9 may be reasonably fast!
――RISC-V has VECTOR instead of SIMD, so it may be necessary to consider it!
――But no one can consider it because there is no RISC-V board ...
$ riscv32-unknown-linux-gnu-gcc --version
riscv32-unknown-linux-gnu-gcc (GCC) 9.2.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
$ make ARCH=riscv CROSS_COMPILE=riscv32-unknown-linux-gnu- menuconfig
If you do CONFIG_BTRFS_FS = y
, you will get CONFIG_XOR_BLOCKS = y
.
Looking at the code for xor.c
, there is an include in xor.h
. But it's not under arch / riscv, right? I was impatient.
crypt/xor.c
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/raid/xor.h>
#include <linux/jiffies.h>
#include <linux/preempt.h>
#include <asm/xor.h>
If you include other architectures ... Yeah, you can see that there is no RISC-V version. I thought it was over here, but the wholesaler doesn't wholesale it.
Before compilation
$find arch -name "xor.h"
arch/alpha/include/asm/xor.h
arch/arm/include/asm/xor.h
arch/arm64/include/asm/xor.h
arch/ia64/include/asm/xor.h
arch/powerpc/include/asm/xor.h
arch/s390/include/asm/xor.h
arch/sparc/include/asm/xor.h
arch/um/include/asm/xor.h
arch/x86/include/asm/xor.h
Can't compile with this ...? If you think, you can connect it to the general-purpose xor.h
with the header automatically generated at compile time, and it will work without problems.
After compilation
$ find arch -name "xor.h"
arch/alpha/include/asm/xor.h
arch/arm/include/asm/xor.h
arch/arm64/include/asm/xor.h
arch/ia64/include/asm/xor.h
arch/powerpc/include/asm/xor.h
arch/riscv/include/generated/asm/xor.h
arch/s390/include/asm/xor.h
arch/sparc/include/asm/xor.h
arch/um/include/asm/xor.h
arch/x86/include/asm/xor.h
$ cat arch/riscv/include/generated/asm/xor.h
#include <asm-generic/xor.h>
$ find include -name "xor.h"
include/asm-generic/xor.h
Looking at the source code {ʻinclude / asm-genetic / xor.h`), it feels really simple ... faithful to the basics, sure.
include/asm-generic/xor.h
static void
xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
long lines = bytes / (sizeof (long)) / 8;
do {
p1[0] ^= p2[0];
p1[1] ^= p2[1];
p1[2] ^= p2[2];
p1[3] ^= p2[3];
p1[4] ^= p2[4];
p1[5] ^= p2[5];
p1[6] ^= p2[6];
p1[7] ^= p2[7];
p1 += 8;
p2 += 8;
} while (--lines > 0);
}
Alright, let's disassmble xor.o and see what's going on! !!
If you check what is defined as a symbol, you can observe the following.
--xor has 8regs and 32regs. --8 regs = 8 bit = unsigned char (previous example) - 32 regs = 32 bit = unsigned int --xor has a system that processes 2,3,4.5 at once.
xor.o symbol
$ riscv32-unknown-linux-gnu-objdump -t xor.o
xor.o: file format elf64-littleriscv
SYMBOL TABLE:
0000000000000000 l df *ABS* 0000000000000000 xor.c
0000000000000000 l d .text 0000000000000000 .text
0000000000000000 l d .data 0000000000000000 .data
0000000000000000 l d .bss 0000000000000000 .bss
0000000000000000 l d __ksymtab_strings 0000000000000000 __ksymtab_strings
0000000000000000 l __ksymtab_strings 0000000000000000 __kstrtab_xor_blocks
000000000000000b l __ksymtab_strings 0000000000000000 __kstrtabns_xor_blocks
0000000000000000 l F .text 0000000000000082 xor_8regs_2
0000000000000082 l F .text 00000000000000bc xor_8regs_3
000000000000013e l F .text 0000000000000104 xor_8regs_4
0000000000000242 l F .text 0000000000000168 xor_8regs_5
00000000000003aa l F .text 000000000000008c xor_32regs_2
0000000000000436 l F .text 00000000000000c0 xor_32regs_3
00000000000004f6 l F .text 0000000000000112 xor_32regs_4
0000000000000608 l F .text 0000000000000160 xor_32regs_5
0000000000000000 l O .sbss 0000000000000008 active_template
0000000000000000 l d .exit.text 0000000000000000 .exit.text
0000000000000000 l F .exit.text 000000000000000c xor_exit
0000000000000000 l d .rodata.str1.8 0000000000000000 .rodata.str1.8
0000000000000000 l d .init.text 0000000000000000 .init.text
0000000000000000 l F .init.text 00000000000000c6 do_xor_speed
00000000000000c6 l F .init.text 00000000000000f6 calibrate_xor_blocks
00000000000007d8 l F .text 000000000000015e xor_32regs_p_5
0000000000000936 l F .text 0000000000000112 xor_32regs_p_4
0000000000000a48 l F .text 00000000000000c4 xor_32regs_p_3
0000000000000b0c l F .text 0000000000000092 xor_32regs_p_2
0000000000000b9e l F .text 0000000000000150 xor_8regs_p_5
0000000000000cee l F .text 0000000000000108 xor_8regs_p_4
0000000000000df6 l F .text 00000000000000c0 xor_8regs_p_3
0000000000000eb6 l F .text 0000000000000088 xor_8regs_p_2
0000000000000000 l .data 0000000000000000 .LANCHOR1
0000000000000000 l O .data 0000000000000038 xor_block_8regs
0000000000000038 l O .data 0000000000000038 xor_block_8regs_p
0000000000000070 l O .data 0000000000000038 xor_block_32regs
00000000000000a8 l O .data 0000000000000038 xor_block_32regs_p
0000000000000000 l d .exitcall.exit 0000000000000000 .exitcall.exit
0000000000000000 l O .exitcall.exit 0000000000000008 __exitcall_xor_exit
0000000000000000 l d .init.data 0000000000000000 .init.data
<Omitted>
If you check the contents of the binary, it is xored and held in units of 8 bytes. And yet, it's very human readable (it's not unreadable). It's the result of unrolling and doing something ...
memo
a0 = bytes
a1 = *p1
a2 = *p2
xor.o asm
$ riscv32-unknown-linux-gnu-objdump -d xor.o
xor.o: file format elf64-littleriscv
Disassembly of section .text:
0000000000000000 <xor_8regs_2>:
0: 1141 addi sp,sp,-16 %Secure Stack
2: e422 sd s0,8(sp) %
4: 0800 addi s0,sp,16 %
6: 8119 srli a0,a0,0x6 % a0 = bytes >> 8
0000000000000008 <.L2>:
8: 00063803 ld a6,0(a2) % a6 = p2[0]
c: 6194 ld a3,0(a1) % a3 = p1[0]
e: 6598 ld a4,8(a1) % a4 = p1[8]
10: 699c ld a5,16(a1) % a5 = p1[16]
12: 0106c6b3 xor a3,a3,a6 % a3 = a3 ^ a6
16: e194 sd a3,0(a1) % p1[0] = a3
18: 6614 ld a3,8(a2) % a3 = p1[8]
1a: 0185b883 ld a7,24(a1) % a7 = p1[24]
1e: 0205b803 ld a6,32(a1) % a6 = p1[32]
22: 8f35 xor a4,a4,a3 % a4 = a4 ^ a3
24: e598 sd a4,8(a1) % p1[8] = a4
26: 01063303 ld t1,16(a2) % t1 = p2[16]
2a: 7594 ld a3,40(a1) % a3 = a1[40]
2c: 7998 ld a4,48(a1) % a4 = a1[48]
2e: 0067c7b3 xor a5,a5,t1 % a5 = a5 ^ t1
32: e99c sd a5,16(a1) % p1[16] = a5
34: 01863303 ld t1,24(a2) % t1 = p2[24]
38: 7d9c ld a5,56(a1) % a5 = p1[56]
3a: 04058593 addi a1,a1,64 % <a1 = a1 + 64>
3e: 0068c8b3 xor a7,a7,t1 % a7 = a7 ^ t1
42: fd15bc23 sd a7,-40(a1) % p1[-40] = a7 (Real 24)
46: 02063883 ld a7,32(a2) % a7 = p2[32]
4a: 04060613 addi a2,a2,64 % <a2 = a2 + 64>
4e: 157d addi a0,a0,-1 % <a0 = a0 - 1>
50: 01184833 xor a6,a6,a7 % a6 = a6 ^ a7
54: ff05b023 sd a6,-32(a1) % p1[-32] = a6(Real 32)
58: fe863803 ld a6,-24(a2) % a6 = p2[-24](Real40)
5c: 0106c6b3 xor a3,a3,a6 % a3 = a3 ^ a6
60: fed5b423 sd a3,-24(a1) % p1[-24] = a3(Real40)
64: ff063683 ld a3,-16(a2) % a3 = a2[-16](Real48)
68: 8f35 xor a4,a4,a3 % a4 = a4 ^ a3
6a: fee5b823 sd a4,-16(a1) % a1[-16] = a4(Real48)
6e: ff863703 ld a4,-8(a2) % a4 = a2[-8](Real56)
72: 8fb9 xor a5,a5,a4 % a5 = a5 ^ a4
74: fef5bc23 sd a5,-8(a1) % p1[-8] = a5 (Real56)
78: f8a048e3 bgtz a0,8 <.L2> % if(a0 > 0) goto L2
7c: 6422 ld s0,8(sp) %
7e: 0141 addi sp,sp,16 %Stack restore
80: 8082 ret
RISC-V has a Vector Extension. Not SIMD. Again, it's not SIMD.
The best thing is to read this fixstars article carefully ...
https://proc-cpuinfo.fixstars.com/2019/10/riscv-v Simply put, it feels like you can "leave what to process in parallel and how much in parallel, not at compile time".
It looks like this if you describe it only with delusions. Now, 64x8, 32x8, 32x4 should work as it should ...
loop:
vlsetvli t0, a0, e128,m8 %8 elements a0 times in 128-bit units. The number of times this time is in t0.
vle.v v1, (a1), vm % v1 = *a1
vle.v v2, (a2), vm % v2 = *a2
vor.vv v1, v1, v2, vm % v1 = v1 ^ v2
vse.v v1, (a1) % p1 = v1
addi a1, t0 % a1 = a1 - t0
addi a2, t0 % a2 = a2 - t0
sub a0, a0, t0 % a0 = a0 - t0
bnez a0, loop %repetition
that's all.