Junk Code Engines for Polymorphic Malware
Good morning! Today we’re going to take a very detailed look at how a junk code generator called ETG (Executable Trash Generator) works. It was created by Z0MBiE (29a) and is designed for 32-bit architecture.
In the second part of the blog, I’ll explain my own implementation, TrashFormer, which is based on the aforementioned tool but designed for 64-bit systems.
Global args
To begin, let’s introduce the global variables used to indicate preferences to the engine at the time of invocation. Regardless of the context from which it’s called, the key is to communicate our preferences to the engine regarding the buffer content (used registers, instructions, control flow…).
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
; ---------------------------------------------------------------------------
; Executable Trash Generator (ETG) Engine
; version 2.00
; (x) 2000 Z0MBiE, z0mbie.cjb.net
; ---------------------------------------------------------------------------
locals __
ETG_MOVRR equ 00000000000000000000000000000001b
ETG_MOVRC equ 00000000000000000000000000000010b
ETG_MOVSXZX equ 00000000000000000000000000000100b
ETG_XCHG equ 00000000000000000000000000001000b
ETG_LEA equ 00000000000000000000000000010000b
ETG_TTTRR equ 00000000000000000000000000100000b
ETG_TTTRC equ 00000000000000000000000001000000b
ETG_INCDEC equ 00000000000000000000000010000000b
ETG_NOTNEG equ 00000000000000000000000100000000b
ETG_TESTRR equ 00000000000000000000001000000000b
ETG_TESTRC equ 00000000000000000000010000000000b
ETG_IMUL equ 00000000000000000000100000000000b
ETG_SHIFT equ 00000000000000000001000000000000b
ETG_SHxD equ 00000000000000000010000000000000b
ETG_BSWAP equ 00000000000000000100000000000000b
ETG_XADD equ 00000000000000001000000000000000b
ETG_BSx equ 00000000000000010000000000000000b
ETG_BTx equ 00000000000000100000000000000000b
ETG_JMPS equ 00000000000001000000000000000000b
ETG_SEG equ 00000000000010000000000000000000b
ETG_REP equ 00000000000100000000000000000000b
ETG_ALL equ 00000000000111111111111111111111b
ETG_DEFAULT equ ETG_TTTRC ; used if no cmds specified
REG_EAX equ 00000001h
REG_ECX equ 00000002h
REG_EDX equ 00000004h
REG_EBX equ 00000008h
REG_ESP equ 00000010h
REG_EBP equ 00000020h
REG_ESI equ 00000040h
REG_EDI equ 00000080h
REG_ALL equ (not REG_ESP) and 255
REG_DEFAULT equ REG_EAX ; used if no regs specified
...
etg_engine
(entry point)
The full logic of the engine is as follows:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
...
etg_engine proc c
arg user_param ; user-parameter
arg cmdavail ; ETG_xxx
arg regsrcavail ; REG_xxx
arg regdstavail ; REG_xxx
arg osizeptr ; ptr to generated bufsize
arg ncmds ; max number of commands
arg bufsize ; max size of buffer
arg bufptr ; ptr to output buffer
arg user_rnd ; external subroutine: rnd
local w
local wx3
local src
local dst
local src2
local dst2
local srcx3
local dstx3
local r1x0
local r1x1
local r1x3
local r2x3
local r3x3
local dst32
local dst32x3
local src32
local src32x3
local any
local anyx3
local any2
pusha
mov edi, bufptr
cld
and regsrcavail, REG_ALL
jnz __1a
mov regsrcavail, REG_DEFAULT
__1a:
and regdstavail, REG_ALL
jnz __1b
mov regdstavail, REG_DEFAULT
__1b:
and cmdavail, ETG_ALL
jnz __2
mov cmdavail, ETG_DEFAULT
__2:
__cycle:
mov eax, edi ; calc curr bufsize
sub eax, bufptr
mov ecx, osizeptr ; store
mov [ecx], eax
add eax, 16 ; to avoid overflow
cmp eax, bufsize ; check max size
jae __cycle_exit
dec ncmds ; check # of cmds
jl __cycle_exit
call __gen1 ; generate one command
jmp __cycle
__cycle_exit:
popa
ret
...
As we can see, it’s divided into cycles that are repeated during each execution until the buffer is filled.
If we look at the example code (in C++) provided by Z0MBiE, we can observe how the arguments are passed to the function:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <io.h>
#pragma hdrstop
#include "..\..\ETG\CPP\etg.hpp"
#include "..\..\ETG\CPP\etg.cpp"
DWORD randseed = GetTickCount();
DWORD __cdecl my_random(DWORD,DWORD range)
{
return
range == 0 ? 0 : (randseed = randseed * 214013 + 2531011) % range;
}
void main()
{
BYTE buf[16384];
DWORD bufsize;
__emit__(0xcc);
void* etg_ptr = &etg_bin;
(*(etg_engine*)etg_ptr)
(0x12345678, // user_param
ETG_ALL, // cmdavail (ETG_XXX)
REG_ALL, // regsrcavail (REG_XXX)
REG_EAX|REG_EBX, // regdstavail (REG_XXX)
&bufsize, // osizeptr (generaetd bufsize)
1000, // ncmds (max nº of commands)
sizeof(buf), // bufsize (max size of buf)
buf, // bufptr (ptr to output buff)
my_random); // user_rnd (external subroutine random func)
buf[bufsize] = 0xC3;
void* bufptr = &buf;
(*(void(*)())bufptr) ();
}
It might not seem like it, but this code helped me solve a lot of doubts when trying to understand the engine.
For this execution we’re going to assume that the values passed to etg_engine
are those found in the program: ETG_ALL
and REG_ALL
, which basically allow the use of any register, whether source or destination, and any instruction.
Let’s start with the code:
1
2
3
4
5
6
...
pusha
mov edi, bufptr
cld
...
In the first lines, we push the registers eax
, ecx
, edx
, ebx
, esp
, ebp
, esi
, and edi
onto the stack using pusha
(note that this instruction doesn’t exist in 64-bit mode). Then, we move the argument value bufptr
into edi
and execute the cld
instruction which clears the direction flag (CLear Direction flag).
This is useful because when writing byte by byte using the stosb
instruction, edi
must point to the buffer that receives the data. Depending on the Direction Flag, the write will move forward or backward through memory.
There will also be fragments (when necessary) where I explain the state of the stack in routines that are repeated, like the following:
pusha (etg_engine
)
EAX pusha
ECX
EDX
EBX
ESP
EBP
ESI
EDI
The code that follows:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
...
and regsrcavail, REG_ALL
jnz __1a
mov regsrcavail, REG_DEFAULT
__1a:
and regdstavail, REG_ALL
jnz __1b
mov regdstavail, REG_DEFAULT
__1b:
and cmdavail, ETG_ALL
jnz __2
mov cmdavail, ETG_DEFAULT
__2:
__cycle:
mov eax, edi ; calc curr bufsize
...
We have a small value assignment because the function arguments specify which instructions to use and which registers are available. In this case, there are three main arguments: cmdavail
, which defines the available instructions for the execution flow (ETG_XXX flags) regsrcavail
, which defines the available source registers (REG_XXX flags) and regdstavail
, which defines the destination registers, meaning the ones that will receive data from the others. In this case, we assign it REG_EAX|REG_EBX
.
Basically, what the code does is check if those argument values are equal to 0. If they are, it assigns default values since this would indicate no input was provided. Otherwise, it keeps the existing values. The goal is to verify that the function arguments are correctly set as they determine the flow of execution.
So after these lines of code we would have:
regsrcavail
→ REG_ALL
regdstavail
→ REG_EAX|REG_EBX
cmdavail
→ ETG_ALL
__cycle
We now enter this subroutine with a clear understanding of the arguments. Let’s look at the full code:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
__cycle:
mov eax, edi ; calc curr bufsize
sub eax, bufptr
mov ecx, osizeptr ; store
mov [ecx], eax
add eax, 16 ; to avoid overflow
cmp eax, bufsize ; check max size
jae __cycle_exit
dec ncmds ; check # of cmds
jl __cycle_exit
call __gen1 ; generate one command
jmp __cycle
__cycle_exit:
popa
ret
...
Now, since this is the first cycle, it might be confusing but what we’re doing in the first lines, mov eax, edi; sub eax, bufptr
, is calculating the difference in written bytes. Both are pointers to the buffer, but edi
(or after the move, eax
) holds the address just past the modified bytes. So if 20 bytes were written, it would now be bufptr + 20
, and the resulting value in eax
would be 20.
Next, with mov ecx, osizeptr; mov [ecx], eax
we update the number of bytes that have been modified.
Then we add 16 to eax
to ensure there’s enough room to avoid overflow and confirm that we’re still within the buffer limits. If not, we jump to __cycle_exit
:
1
2
3
4
5
...
add eax, 16 ; to avoid overflow
cmp eax, bufsize ; check max size
jae __cycle_exit
...
1
2
3
4
5
6
7
8
...
dec ncmds ; check # of cmds
jl __cycle_exit
call __gen1 ; generate one command
jmp __cycle
...
Now we subtract 1 from ncmds
(number of commands), and if ncmds
is less than 0, we jump to __cycle_exit
.
Finally, if all checks pass, we jump to __gen1
:
1
2
3
4
5
6
7
8
...
call __gen1 ; generate one command
jmp __cycle
__cycle_exit:
popa
ret
__gen1
The code would be as follows:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
__gen1: mov w, 1
mov wx3, 1 shl 3
call __getregdst_avail
mov dst32, eax
shl eax, 3
mov dst32x3, eax
call __getregsrc_avail
mov src32, eax
shl eax, 3
mov src32x3, eax
mov eax, regdstavail
and eax, regsrcavail
test eax, REG_EAX+REG_EBX+REG_ECX+REG_EDX
jz __32only
mov eax, 2
call __call_rnd
mov w, eax
shl eax, 3
mov wx3, eax
...
The first part of the code simply performs a few mov
instructions and calls __getregdst_avail
:
1
2
3
4
5
6
7
8
__gen1: mov w, 1 ; w = 1
mov wx3, 1 shl 3 ; wx3 = 8
call __getregdst_avail
mov dst32, eax
shl eax, 3
mov dst32x3, eax
...
Let’s take a look at what this function does.
__getregdst_avail
1
2
__getregdst_avail: mov edx, regdstavail
jmp __getreg_avail
As we can see, this function is a wrapper to load regdstavail
(in this case, REG_EAX|REG_EBX
) into edx
. Right after that, it jumps to __getreg_avail
.
__getreg_avail
1
2
3
4
5
6
7
8
__getreg_avail: call __getrnd8 ; get random available reg.
mov ecx, eax
cmp w, 0 ; w==0, 16bit?
jnz __32
and ecx, 3 ; 16bit# --> 32bit#
__32: bt edx, ecx
jnc __getreg_avail
retn
__getrnd8
As we can see, the first thing done is a call to __getrnd8
, a function that returns a number between 0 and 7 into eax
for later use.
1
2
3
__getrnd8: mov eax, 8
call __call_rnd
retn
What it does is move the value 8 into eax
and call __call_rnd
. This function uses the user-provided random function (user_rnd
) and its corresponding parameter (user_param
). Before the call, we push eax
(which is 8), since one of the arguments to the random function is the range for the random number.
1
2
3
4
5
6
7
8
9
__call_rnd: pusha
push eax
push user_param
call user_rnd
add esp, 8
mov [esp+7*4], eax
popa
or eax, eax
retn
As mentioned earlier, pusha
is used to save the register state. After the random number is generated, we clean up the stack removing the pushed value 8 and user_param
and then restore eax
with the random result, followed by a popa
to restore everything else, leaving the random number in eax
.
Before continuing, I’d like to show two things. First, how the exported function might look when defined in C++, the external user_rnd
which let us to get a completely random number:
1
2
3
4
5
6
7
DWORD randseed = GetTickCount();
DWORD __cdecl my_random(DWORD,DWORD range)
{
return
range == 0 ? 0 : (randseed = randseed * 214013 + 2531011) % range;
}
Also, let’s take a quick look at the stack during this execution flow:
- mov eax, 8 (
etg_engine
->__cycle
->__gen1
->__getregdst_avail
->__getreg_avail
->__getrnd8
) - pusha (
etg_engine
->__cycle
->__gen1
->__getregdst_avail
->__getreg_avail
->__getrnd8
->call_rnd
) - push eax (
etg_engine
->__cycle
->__gen1
->__getregdst_avail
->__getreg_avail
->__getrnd8
->call_rnd
) - push user_param (
etg_engine
->__cycle
->__gen1
->__getregdst_avail
->__getreg_avail
->__getrnd8
->call_rnd
)
EAX pusha
ECX
EDX
EBX
ESP
EBP
ESI
EDI
EAX pusha
ECX
EDX
EBX
ESP
EBP
ESI
EDI
eax (8)
user_param
- add esp, 8 (
etg_engine
->__cycle
->__gen1
->__getregdst_avail
->__getreg_avail
->__getrnd8
->call_rnd
) - mov
[esp+7*4]
, eax (etg_engine
->__cycle
->__gen1
->__getregdst_avail
->__getreg_avail
->__getrnd8
->call_rnd
) - popa (
etg_engine
->__cycle
->__gen1
->__getregdst_avail
->__getreg_avail
->__getrnd8
->call_rnd
) - eax = 0-7 (
etg_engine
->__cycle
->__gen1
->__getregdst_avail
->__getreg_avail
->__getrnd8
->call_rnd
)
EAX pusha
ECX
EDX
EBX
ESP
EBP
ESI
EDI
__getreg_avail
(after __getrnd8
)
When returning, we’re back in __getreg_avail
with a value between 0 and 7 in eax
:
1
2
3
4
5
6
7
8
__getreg_avail: call __getrnd8 ; get random available reg.
mov ecx, eax
cmp w, 0 ; w==0, 16bit?
jnz __32
and ecx, 3 ; 16bit# --> 32bit#
__32: bt edx, ecx
jnc __getreg_avail
retn
Next, we move this 0–7 value into ecx
, representing the register we’re going to use. We then check that the local variable w
is not zero (we’ll get to that later). If it is zero, we perform an and
with 3 to restrict the register choice, then we do a bit test (bt
) between ecx
and edx
. This works because edx
holds the bitmask of available registers, and ecx
represents the register index we’re trying to use. The bt
instruction checks if the corresponding bit is set, meaning the register is available. If not, the jnc
(jump if no carry) allows us to loop and try again with a new random register until we get one that’s valid.
__gen1
(after __getreg_avail
)
1
2
3
4
5
6
7
8
9
__gen1: mov w, 1
mov wx3, 1 shl 3
call __getregdst_avail
mov dst32, eax
shl eax, 3
mov dst32x3, eax
...
At this point, we set the local variable dst32
with the selected register and do the same for dst32x3
.
The next fragment of the function follows exactly the same logic:
1
2
3
4
5
6
...
call __getregsrc_avail
mov src32, eax
shl eax, 3
mov src32x3, eax
...
Continuing:
1
2
3
4
5
6
...
mov eax, regdstavail
and eax, regsrcavail
test eax, REG_EAX+REG_EBX+REG_ECX+REG_EDX
jz __32only
...
The first two lines filter out common source and destination registers, store the result in eax
, and then perform a test
to check whether any of them is eax
, ebx
, ecx
, or edx
. If so, execution continues, otherwise it jumps to __32only
since it means the registers involved would be edi
, esi
, ebp
, or esp
.
(We’ll cover __32only
later since it comes after the next section.)
1
2
3
4
5
6
7
...
mov eax, 2
call __call_rnd
mov w, eax
shl eax, 3
mov wx3, eax
...
This fragment ONLY executes if one of the available registers is eax
, ebx
, ecx
, or edx
.
Here, we get a random value of 0 or 1 (50% chance) in eax
, then move it to the variable w
, and then to its corresponding wx3
. After that, we proceed through the next few lines to eventually reach __32only
.
__32only
this is the full code of this label:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
__32only:
mov eax, 2
call __call_rnd
mov r1x0, eax
shl eax, 1
mov r1x1, eax
shl eax, 3-1
mov r1x3, eax
mov eax, 4
call __call_rnd
shl eax, 3
mov r2x3, eax
call __getrnd8
shl eax, 3
mov r3x3, eax
call __getregsrc_avail
mov src, eax
shl eax, 3
mov srcx3, eax
call __getregsrc_avail
mov src2, eax
call __getregdst_avail
mov dst, eax
shl eax, 3
mov dstx3, eax
call __getregdst_avail
mov dst2, eax
call __getregany_avail
mov any, eax
shl eax, 3
mov anyx3, eax
call __getregany_avail
mov any2, eax
...
Let’s go through it step by step to avoid any confusion:
1
2
3
4
5
6
7
8
9
10
11
__32only:
mov eax, 2
call __call_rnd
mov r1x0, eax
shl eax, 1
mov r1x1, eax
shl eax, 3-1
mov r1x3, eax
...
The first thing we do is (once again) get a random number between 0 and 1 (50% chance), then move it to r1x0
. That same value is then shifted left by 1 bit and stored in r1x1
and shifted left by 3 bits and stored in r1x3
.
Next block:
1
2
3
4
5
6
7
8
9
10
...
mov eax, 4
call __call_rnd
shl eax, 3
mov r2x3, eax
call __getrnd8
shl eax, 3
mov r3x3, eax
...
We generate a random number from 0 to 3, shift it left by 3 bits and store it in r2x3
. Then we get another random number from 0 to 7, shift it left by 3 bits as well and store it in r3x3
.
Moving on:
1
2
3
4
5
6
7
8
...
call __getregsrc_avail
mov src, eax
shl eax, 3
mov srcx3, eax
call __getregsrc_avail
mov src2, eax
...
First, we retrieve the available source registers and store them in src
. Then we shift eax
left by 3 bits and store it in srcx3
. Lastly we call __getregsrc_avail
again and store the result in src2
.
In the end, all the __getreg...
functions follow the same internal flow. Here’s the complete logic across all of them:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
__getregsrc_avail: mov edx, regsrcavail
jmp __getreg_avail
__getregdst_avail: mov edx, regdstavail
jmp __getreg_avail
__getregany_avail: mov edx, regsrcavail
or edx, regdstavail
jmp __getreg_avail
__getreg_avail: call __getrnd8 ; get random available reg.
mov ecx, eax
cmp w, 0 ; w==0, 16bit?
jnz __32
and ecx, 3 ; 16bit# --> 32bit#
__32: bt edx, ecx
jnc __getreg_avail
retn
Now let’s look at the final block from __32only
:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
...
call __getregdst_avail
mov dst, eax
shl eax, 3
mov dstx3, eax
call __getregdst_avail
mov dst2, eax
call __getregany_avail
mov any, eax
shl eax, 3
mov anyx3, eax
call __getregany_avail
mov any2, eax
This part is essentially the same as previously explained but using __getregdst_avail
and __getregany_avail
instead so there’s no big mystery here.
Now let’s move on to the next and arguably “most important” lable: __gen1_recycle
, which will act as the main loop the program iterates through.
__gen1_recycle
Here’s the code for this function (including macros):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
__gen1_recycle: mov eax, 31
call __call_rnd
xchg esi, eax
inc esi
mov edx, cmdavail
mov eax, w
dxx macro x
ifnb <x>
dec esi
jz x
endif
endm
cxx macro a1,a2,a3,a4,a5,a6,a7,a8,a9
local t
shr edx, 1
jnc t
dxx a1
dxx a2
dxx a3
dxx a4
dxx a5
dxx a6
dxx a7
dxx a8
dxx a9
t: endm
cxx __a00,__a01 ;ETG_MOVRR
cxx __a02,__a03 ;ETG_MOVRC
cxx __a04 ;ETG_MOVSXZX
cxx __a06 ;ETG_XCHG
cxx __a07 ;ETG_LEA
cxx __a08,__a09 ;ETG_TTTRR
cxx __a10,__a11 ;ETG_TTTRC
cxx __a12,__a13 ;ETG_INCDEC
cxx __a14 ;ETG_NOTNEG
cxx __a17 ;ETG_TESTRR
cxx __a18 ;ETG_TESTRC
cxx __a19,__a20 ;ETG_IMUL
cxx __a21,__a22 ;ETG_SHIFT
cxx __a23,__a24 ;ETG_SHxD
cxx __a25 ;ETG_BSWAP
cxx __a26 ;ETG_XADD
cxx __a27 ;ETG_BSx
cxx __a28,__a29 ;ETG_BTx
cxx __a30 ;ETG_JMPS
cxx __a31,__a32 ;ETG_SEG
cxx __a33 ;ETG_REP
jmp __gen1_recycle
The beginning of the function:
1
2
3
4
5
6
7
8
__gen1_recycle: mov eax, 31
call __call_rnd
xchg esi, eax
inc esi
mov edx, cmdavail
mov eax, w
...
First, we get a random number from 0–30 and store it in eax
. Then we use xchg
to swap the contents of eax
(the random 0–30 value) and esi
. Why do we do this? We’ll see that shortly when we explain the macros. Next we increment esi
(so now it holds a value from 1 to 31) move the cmdavail
constant into edx
and finally move the w
variable into eax
.
Now let’s look at the macros, since they basically do the core work of the engine:
dxx
1
2
3
4
5
6
dxx macro x
ifnb <x>
dec esi
jz x
endif
endm
A key detail was setting esi
earlier. This macro takes a label as an argument, decrements esi
(remember, it was between 1–31 after the inc
), and if esi
becomes 0 it jumps to x
. In other words, after xchg esi, eax
and inc esi
, esi
holds a value from 1 to 31. So this macro only jumps if esi
was 1 (and becomes 0 after dec
). This becomes more meaningful when combined with the cxx
macro.
cxx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
cxx macro a1,a2,a3,a4,a5,a6,a7,a8,a9
local t
shr edx, 1
jnc t
dxx a1
dxx a2
dxx a3
dxx a4
dxx a5
dxx a6
dxx a7
dxx a8
dxx a9
t: endm
cxx __a00,__a01 ;ETG_MOVRR
cxx __a02,__a03 ;ETG_MOVRC
cxx __a04 ;ETG_MOVSXZX
...
This macro takes a label as an argument, defines a local label, then performs a shr edx, 1
. This works because edx
contains cmdavail
(the command mask). If the resulting bit (representing the command) is 0, that means the command is not available so we skip it and move on. But if it’s 1, then the command is allowed and we execute dxx
for the given label.
Next comes the permutation flow:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
t: endm
cxx __a00,__a01 ;ETG_MOVRR
cxx __a02,__a03 ;ETG_MOVRC
cxx __a04 ;ETG_MOVSXZX
cxx __a06 ;ETG_XCHG
cxx __a07 ;ETG_LEA
cxx __a08,__a09 ;ETG_TTTRR
cxx __a10,__a11 ;ETG_TTTRC
cxx __a12,__a13 ;ETG_INCDEC
cxx __a14 ;ETG_NOTNEG
cxx __a17 ;ETG_TESTRR
cxx __a18 ;ETG_TESTRC
cxx __a19,__a20 ;ETG_IMUL
cxx __a21,__a22 ;ETG_SHIFT
cxx __a23,__a24 ;ETG_SHxD
cxx __a25 ;ETG_BSWAP
cxx __a26 ;ETG_XADD
cxx __a27 ;ETG_BSx
cxx __a28,__a29 ;ETG_BTx
cxx __a30 ;ETG_JMPS
cxx __a31,__a32 ;ETG_SEG
cxx __a33 ;ETG_REP
jmp __gen1_recycle
Here we cycle through the macros and execute instruction blocks corresponding to commands enabled in the engine’s mask.
As a reminder, here are the global variables:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
locals __
ETG_MOVRR equ 00000000000000000000000000000001b
ETG_MOVRC equ 00000000000000000000000000000010b
ETG_MOVSXZX equ 00000000000000000000000000000100b
ETG_XCHG equ 00000000000000000000000000001000b
ETG_LEA equ 00000000000000000000000000010000b
ETG_TTTRR equ 00000000000000000000000000100000b
ETG_TTTRC equ 00000000000000000000000001000000b
ETG_INCDEC equ 00000000000000000000000010000000b
ETG_NOTNEG equ 00000000000000000000000100000000b
ETG_TESTRR equ 00000000000000000000001000000000b
ETG_TESTRC equ 00000000000000000000010000000000b
ETG_IMUL equ 00000000000000000000100000000000b
ETG_SHIFT equ 00000000000000000001000000000000b
ETG_SHxD equ 00000000000000000010000000000000b
ETG_BSWAP equ 00000000000000000100000000000000b
ETG_XADD equ 00000000000000001000000000000000b
ETG_BSx equ 00000000000000010000000000000000b
ETG_BTx equ 00000000000000100000000000000000b
ETG_JMPS equ 00000000000001000000000000000000b
ETG_SEG equ 00000000000010000000000000000000b
ETG_REP equ 00000000000100000000000000000000b
ETG_ALL equ 00000000000111111111111111111111b
ETG_DEFAULT equ ETG_TTTRC ; used if no cmds specified
REG_EAX equ 00000001h
REG_ECX equ 00000002h
REG_EDX equ 00000004h
REG_EBX equ 00000008h
REG_ESP equ 00000010h
REG_EBP equ 00000020h
REG_ESI equ 00000040h
REG_EDI equ 00000080h
REG_ALL equ (not REG_ESP) and 255
REG_DEFAULT equ REG_EAX ; used if no regs specified
Permutations
To wrap up, let’s briefly review the part of Z0MBiE’s implementation responsible for writing bytes, the functions that generate the actual junk code:
(It’s worth reiterating that it generates 32-bit instructions, as previously mentioned)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
__a00: or al, 88h
stosb
__11sd_retn: mov al, 0C0h
or eax, srcx3
__c3: or eax, dst
stosb
retn
__a01: or al, 8Ah
stosb
__11ds_retn: mov al, 0C0h
or eax, dstx3
__c2: or eax, src
stosb
retn
__a02: mov al, 0B0h
or eax, wx3
__c1: or eax, dst
stosb
jmp __dataw_retn
__a03: or al, 0C6h
stosb
mov al, 11000000b
jmp __c1
__a04: mov al, 0Fh
stosb
mov al, 10110110b
or eax, w
or eax, r1x3
stosb
__11d32s_retn: mov al, 11000000b
or eax, dst32x3
jmp __c2
__a05: or al, 10000110b
stosb
__11dd_retn: mov al, 11000000b
or eax, dstx3
or eax, dst2
stosb
retn
__a06: or al, 10000110b
stosb
jmp __11dd_retn
__a07: mov al, 8Dh
stosb
mov al, 05h
or eax, dst32x3
stosb
jmp __dword_retn
__a08: or al, 00000000b
or eax, r3x3
stosb
jmp __11sd_retn
__a09: or al, 00000010b
or eax, r3x3
stosb
jmp __11ds_retn
__a10: or al, 10000000b
stosb
mov al, 11000000b
or eax, r3x3
or eax, dst
stosb
jmp __dataw_retn
__a11: test regdstavail, REG_EAX
jz __gen1_recycle
or al, 00000100b
or eax, r3x3
stosb
jmp __dataw_retn
__a12: or al, 11111110b
stosb
mov al, 11000000b
or eax, r1x3
jmp __c3
__a13: mov al, 01000000b
or eax, r1x3
or eax, dst32
stosb
retn
__a14: or al, 11110110b
stosb
mov al, 11010000b
or eax, r1x3
jmp __c3
;__a15: or al, 00111000b
; stosb
; jmp __11xx_retn
;
;__a16: or al, 10000000b
; stosb
; mov al, 11111000b
; or eax, any
; stosb
; jmp __dataw_retn
__a17: or al, 10000100b
stosb
__11xx_retn: mov al, 11000000b
or eax, anyx3
or eax, any2
stosb
retn
__a18: or al, 11110110b
stosb
mov al, 11000000b
or eax, any
stosb
jmp __dataw_retn
__a19: mov al, 00001111b
stosb
mov al, 10101111b
stosb
__11d32s32_retn: mov al, 11000000b
or eax, dst32x3
or eax, src32
stosb
retn
__a20: mov al, 69h
stosb
call __11d32s32_retn
jmp __dword_retn
__a21: or al, 11010000b
or eax, r1x1
stosb
mov al, 11000000b
or eax, r3x3
or eax, dst
stosb
retn
__a22: or al, 11000000b
stosb
mov al, 11000000b
or eax, r3x3
or eax, dst
stosb
jmp __byte_retn
__a23: mov al, 00001111b
stosb
mov al, 10100100b
or eax, r1x3
stosb
mov al, 11000000b
call __11s32d32_retn
jmp __byte_retn
__11s32d32_retn: mov al, 11000000b
or eax, src32x3
__c4: or eax, dst32
stosb
retn
__a24: test regsrcavail, REG_ECX
jz __gen1_recycle
mov al, 00001111b
stosb
mov al, 10100101b
or eax, r1x3
stosb
jmp __11s32d32_retn
__a25: mov al, 00001111b
stosb
mov al, 11001000b
jmp __c4
__a26: mov al, 00001111b
stosb
mov al, 11000000b
or eax, w
stosb
jmp __11dd_retn
__a27: mov al, 00001111b
stosb
mov al, 10111100b
or eax, r1x0 ; r1x0
stosb
jmp __11d32s32_retn
__a28: mov al, 00001111b
stosb
mov al, 10111010b
stosb
mov al, 11100000b
or eax, r2x3
or eax, dst32
stosb
jmp __byte_retn
__a29: mov al, 00001111b
stosb
mov al, 10100011b
or eax, r2x3
stosb
jmp __11s32d32_retn
__a30: mov ax, 01EBh
stosw
mov eax, 256
call __call_rnd
stosb
retn
__a31: mov al, 00100110b
or eax, r2x3
stosb
retn
__a32: mov al, 64h
or eax, r1x0
stosb
retn
__a33: mov al, 0F2h
or eax, r1x0
stosb
retn
__dataw_retn: cmp w, 0
je __byte_retn
__dword_retn: call __word_retn
__word_retn: call __byte_retn
__byte_retn: mov eax, 256
call __call_rnd
stosb
retn
Now that we’ve thoroughly analyzed ETG, let’s take a look at my own 64-bit implementation of a junk code generator engine, designed as part of a polymorphic obfuscation approach.
TrashFormer Engine
That’s right, my own 64-bit implementation of ETG. A few key points to highlight:
- I built it based on what I believe works best when generating junk code. Code that’s potentially useful (to confuse reverse engineers) and that does not alter the original program flow. What does that mean? For instance, instructions like
xchg
are excluded because they can modify vital register values for execution. - The logic behind TrashFormer is entirely different from ETG. While ETG uses macro-based permutation, TrashFormer is a full cycle divided into three phases, responsible for placing the appropriate instructions, including filtering.
- As mentioned above,
xchg
and many other instructions are excluded. In fact, onlymov
,or
,xor
,cmp
, andlea
are supported. - The engine is designed to insert a few instructions between executions. The idea is to avoid drawing much attention while still creating maximum distraction.
- It is written 100% in assembly, specifically for MASM (Microsoft Assembler).
- It’s called from C++ for ease of integration and display.
- This is just the code generator. In future posts, we’ll cover how malware using this engine must align buffers and handle architecture-specific tasks if does the reprogramming at runtime.
- In the code, I deliberately avoid using the stack, keeping it clean (except for
push
andpop
). Everything is handled via variables and registers:rax
,rdx
,rdx
,r8
, andr9
.
With that said, we can now dive into the code explanation.
Theory
First, let’s understand the theory behind the functionality.
The logic is essentially based on 3 bytes (again, 64-bit):
- 3rd byte – register used: the register receiving the information, covering both rXX (
rax
,rcx
,rdx
) and rX (r8
,r9
,r10
,r11
) - 2nd byte – instruction to execute:
1 2 3 4 5 6
mov_cmd equ 0x8b xchg_cmd equ 0x87 or_cmd equ 0x0b xor_cmd equ 0x33 lea_cmd equ 0x8D cmp_cmd equ 0x3B
- 1st byte – data flow direction: there are four possible flows:
rxx_rxx
,rxx_rx
,rx_rxx
, andrx_rx
```- C0 11000000 rxx_rxx, rxx_rx, rx_rxx, rx_rx (0x48, 0x49, 0x4c, 0x4d)
- C1 11000001 rxx_rxx, rxx_rx, rx_rxx, rx_rx (0x48, 0x49, 0x4c, 0x4d)
- C2 11000010 rxx_rxx, rxx_rx, rx_rxx, rx_rx (0x48, 0x49, 0x4c, 0x4d)
- C3 11000011 rxx_rx, rx_rx (0x49, 0x4d)
- C8 11001000 rxx_rxx, rxx_rx, rx_rxx, rx_rx (0x48, 0x49, 0x4c, 0x4d)
- C9 11001001 rxx_rxx, rxx_rx, rx_rxx, rx_rx (0x48, 0x49, 0x4c, 0x4d)
- CA 11001010 rxx_rxx, rxx_rx, rx_rxx, rx_rx (0x48, 0x49, 0x4c, 0x4d)
- CB 11001011 rxx_rx, rx_rx (0x49, 0x4d)
- D0 11010000 rxx_rxx, rxx_rx, rx_rxx, rx_rx (0x48, 0x49, 0x4c, 0x4d)
- D1 11010001 rxx_rxx, rxx_rx, rx_rxx, rx_rx (0x48, 0x49, 0x4c, 0x4d)
- D2 11010010 rxx_rxx, rxx_rx, rx_rxx, rx_rx (0x48, 0x49, 0x4c, 0x4d)
- D3 11010011 rxx_rx, rx_rx (0x49, 0x4d)
- D8 11011000 rxx_rx, rx_rx (0x49, 0x4d)
- D9 11011001 rxx_rx, rx_rx (0x49, 0x4d)
- DA 11011010 rxx_rx, rx_rx (0x49, 0x4d)
- DB 11011011 rx_rx (0x4d) ```
This might seem confusing at first, so let’s look at the mov
instruction as an example:
mov
1st Opcode
0x48
1
2
3
4
5
6
7
8
9
.text:0000000140001018 mov rax, rax 48 8B C0
.text:000000014000101B mov rax, rcx 48 8B C1
.text:000000014000101E mov rax, rdx 48 8B C2
.text:0000000140001031 mov rcx, rax 48 8B C8
.text:0000000140001034 mov rcx, rcx 48 8B C9
.text:0000000140001037 mov rcx, rdx 48 8B CA
.text:000000014000104A mov rdx, rax 48 8B D0
.text:000000014000104D mov rdx, rcx 48 8B D1
.text:0000000140001050 mov rdx, rdx 48 8B D2
0x49
1
2
3
4
5
6
7
8
9
10
11
12
.text:0000000140001021 mov rax, r8 49 8B C0
.text:0000000140001024 mov rax, r9 49 8B C1
.text:0000000140001027 mov rax, r10 49 8B C2
.text:000000014000102A mov rax, r11 49 8B C3
.text:000000014000103A mov rcx, r8 49 8B C8
.text:000000014000103D mov rcx, r9 49 8B C9
.text:0000000140001040 mov rcx, r10 49 8B CA
.text:0000000140001043 mov rcx, r11 49 8B CB
.text:0000000140001053 mov rdx, r8 49 8B D0
.text:0000000140001056 mov rdx, r9 49 8B D1
.text:0000000140001059 mov rdx, r10 49 8B D2
.text:000000014000105C mov rdx, r11 49 8B D3
0x4C
1
2
3
4
5
6
7
8
9
10
11
12
.text:0000000140001063 mov r8, rax 4C 8B C0
.text:0000000140001066 mov r8, rcx 4C 8B C1
.text:0000000140001069 mov r8, rdx 4C 8B C2
.text:000000014000107C mov r9, rax 4C 8B C8
.text:000000014000107F mov r9, rcx 4C 8B C9
.text:0000000140001082 mov r9, rdx 4C 8B CA
.text:0000000140001095 mov r10, rax 4C 8B D0
.text:0000000140001098 mov r10, rcx 4C 8B D1
.text:000000014000109B mov r10, rdx 4C 8B D2
.text:00000001400010AE mov r11, rax 4C 8B D8
.text:00000001400010B1 mov r11, rcx 4C 8B D9
.text:00000001400010B4 mov r11, rdx 4C 8B DA
0x4D
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
.text:000000014000106C mov r8, r8 4D 8B C0
.text:000000014000106F mov r8, r9 4D 8B C1
.text:0000000140001072 mov r8, r10 4D 8B C2
.text:0000000140001075 mov r8, r11 4D 8B C3
.text:0000000140001085 mov r9, r8 4D 8B C8
.text:0000000140001088 mov r9, r9 4D 8B C9
.text:000000014000108B mov r9, r10 4D 8B CA
.text:000000014000108E mov r9, r11 4D 8B CB
.text:000000014000109E mov r10, r8 4D 8B D0
.text:00000001400010A1 mov r10, r9 4D 8B D1
.text:00000001400010A4 mov r10, r10 4D 8B D2
.text:00000001400010A7 mov r10, r11 4D 8B D3
.text:00000001400010B7 mov r11, r8 4D 8B D8
.text:00000001400010BA mov r11, r9 4D 8B D9
.text:00000001400010BD mov r11, r10 4D 8B DA
.text:00000001400010C0 mov r11, r11 4D 8B DB
0x89
(2 bytes -> 32bits)
1
2
3
4
5
6
7
8
9
.text:00000001400010E7 mov eax, eax 8B C0
.text:00000001400010E9 mov eax, ecx 8B C1
.text:00000001400010EB mov eax, edx 8B C2
.text:00000001400010F0 mov ecx, eax 8B C8
.text:00000001400010F2 mov ecx, ecx 8B C8
.text:00000001400010F4 mov ecx, edx 8B CA
.text:00000001400010F9 mov edx, eax 8B D0
.text:00000001400010FB mov edx, ecx 8B D1
.text:00000001400010FD mov edx, edx 8B D2
(I’ve also included the 32-bit version, although it’s not the target of this engine)
As we can see, using an instruction and register mask along with filtering allows us to generate a junk shellcode that does not affect the actual flow of the program.
Global Variables
Now that the theory is out of the way, let’s jump into the actual code.
We’ll start with the global variables used to filter the output. These are also used when calling the engine:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
.data
; mask: 4 bytes (r8)
; [x]xxx -> usable registers
; x[x]xx -> usable instructions
; xx[x]x -> source and destination registers
; xxx[x] -> InstructionNumber
; usable registers mask
reg_rax equ 00000001b ; 1
reg_rcx equ 00000010b ; 2
reg_rdx equ 00000100b ; 4
reg_r8 equ 00001000b ; 8
reg_r9 equ 00010000b ; 16
reg_r10 equ 00100000b ; 32
reg_r11 equ 01000000b ; 64
reg_any equ 01111111b ; 127
; instructions used
mov_cmd equ 00000001b ; 8bh
cmp_cmd equ 00000010b ; 3Bh
or_cmd equ 00000100b ; 0bh
xor_cmd equ 00001000b ; 33h
lea_cmd equ 00010000b ; 8Dh
any_cmd equ 11111111b ; FFh
; src dst registers
reg_rxx_rxx equ 00000001b ; 1
reg_rxx_rx equ 00000010b ; 2
reg_rx_rxx equ 00000100b ; 4
reg_rx_rx equ 00001000b ; 8
reg_any_any equ 11111111b ; 0xFF
; usable destination registers operation
ins_c0 equ 00001001b ; (rax, r8)
ins_c8 equ 00010010b ; (rcx, r9)
ins_d0 equ 00100100b ; (rdx, r10)
ins_d8 equ 01000000b ; (r11)
UserBuf qword 0h
UserSize dword 0h
usableReg byte 0h
usableIns byte 0h
srcDest byte 0h
instructionsNumber byte 0h
RegistersMask byte 0h
...
Here, we declare the important global variables and assign values to the instruction mask.
Now let’s get into the actual engine code.
TrashFormer Entrypoint
The first step is to check the variables passed from C++. These are all received in r8
as a mask and then transferred into the global variables.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
.code
TrashFormer proc public
push rcx
push rdx
push rdi
push r8
push r9
push r10
push r11
xor rax, rax ; rax = 0
or rcx, rcx ; check userbuf ptr
jz __error
mov UserBuf, rcx
mov rdi, rcx ; move userbuf ptr to `rdi` for `stosb` instruction
xor rcx, rcx
or edx, edx ; check userbuf size
jz __error
cmp edx, 16
jbe __error
sub edx, 16 ; avoid overflows
mov UserSize, edx
xor rdx, rdx
or r8d, r8d ; check mask flag
jz __error
or r8b, r8b ; check number of junk instructions
jz __error
mov instructionsNumber, r8b
shr r8d, 8
or r8b, r8b ; check usable srcdst flag
jz __error
mov srcDest, r8b
shr r8d, 8
or r8b, r8b ; check usable instructions flag
jz __error
mov usableIns, r8b
shr r8d, 8
or r8b, r8b ; check usable registers flag
jz __error
mov usableReg, r8b
shr r8d, 8
xor rax, rax
mov al, instructionsNumber
cmp eax, UserSize
jae __error
...
We check for invalid or unexpected values that could cause errors.
As mentioned, the global variables are passed from C++ like this (this code is how we call and execute the engine from C++):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#include <windows.h>
#include <stdio.h>
//
// cmp usablereg, 8
// jae __only_rxx
// usable registers mask
#define reg_rax 1 // 00000001b
#define reg_rcx 2 // 00000010b
#define reg_rdx 4 // 00000100b
#define reg_r8 8 // 00001000b
#define reg_r9 16 // 00010000b
#define reg_r10 32 // 00100000b
#define reg_r11 64 // 01000000b
#define reg_any 127 // 10000000b
// instructions used
#define mov_cmd 1
#define cmp_cmd 2
#define or_cmd 4
#define xor_cmd 8
#define lea_cmd 16
#define any_cmd 0xFF
// src dst registers
#define reg_rxx_rxx 1
#define reg_rxx_rx 2
#define reg_rx_rxx 4
#define reg_rx_rx 8
#define reg_any_any 0xff
extern "C" unsigned int TrashFormer(void* pFreeBuf, unsigned long sFreeBuf, unsigned long mask);
int main() {
BYTE pTrashBuffer[1000] = { 0 };
printf("\npTrashBuffer: %p\n", pTrashBuffer);
// call to the engine
// unsigned int result = TrashFormer(pTrashBuffer, 300, (((reg_any) << 24) | ((any_cmd) << 16) | ((reg_any_any) << 8) | (0xFF)));
// unsigned int result = TrashFormer(pTrashBuffer, 300, (/*usable registers*/ ((reg_r8) << 24) | /*instruction used*/((mov_cmd) << 16) | /*Instruction movement*/ ((reg_rx_rx) << 8) | /*Number of instructions*/ (0x10)));
unsigned int result = TrashFormer(pTrashBuffer, 300, (/*usable registers*/ ((reg_rax) << 24) | /*instruction used*/(((any_cmd) << 16) | /*Instruction movement*/ (reg_rxx_rxx | reg_rx_rxx) << 8) | /*Number of instructions*/ (0xff)));
printf("\nresult: %u\n", result);
// return 0;
printf("\n\n");
// print buffer on hex byte format
for (unsigned int i = 0; i < 300; i++) {
if (i % 8 == 0) {
printf("\n\t");
}
if (i + 2 > 300) {
printf("0x%0.2X\n\n", (BYTE*)pTrashBuffer[i]);
}
else {
printf("0x%0.2X, ", (BYTE*)pTrashBuffer[i]);
}
}
// make the buffer executable
DWORD OldProtection = 0;
VirtualProtect(pTrashBuffer, 1000, PAGE_EXECUTE_READWRITE, &OldProtection);
// execute the content on the buffer
(*(void(*)())(void*)pTrashBuffer) ();
return 0;
}
The engine returns 1234
if something went wrong, or 0
if everything completed successfully.
Here’s what the __error
and __exit
labels look like:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
__error:
mov rcx, r9
mov al, 00h
rep stosb
mov rax, 1234
__exit:
pop r11
pop r10
pop r9
pop r8
pop rdi
pop rdx
pop rcx
ret
TrashFormer endp
The goal is to exit as cleanly as possible.
Also here’s the __success
label used to exit with a return code of 0:
1
2
3
4
__Success:
xor rax, rax
jmp __exit
And the __ret_end
label, which writes a ret
instruction (0xc3
) to the end of the buffer, allowing clean return:
1
2
3
4
5
__ret_end:
xor rax, rax
mov al, 0c3h
stosb
jmp __success
It’s also worth noting the use of helper functions to return random numbers. I know they’re not perfectly optimized, and I didn’t need that many but it was the simplest way for me to call them without getting confused by register values.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
; randNumX: Get a random number between 0 and (X-1). The result is on rax
randNum219 proc
call randFunc
xor rdx, rdx
mov rcx, 219
div rcx
mov rax, rdx
ret
randNum219 endp
randNum64 proc
call randFunc
xor rdx, rdx
mov rcx, 64
div rcx
mov rax, rdx
ret
randNum64 endp
randNum8 proc
call randFunc
xor rdx, rdx
mov rcx, 8
div rcx
mov rax, rdx
ret
randNum8 endp
randNum7 proc
call randFunc
xor rdx, rdx
mov rcx, 7
div rcx
mov rax, rdx
ret
randNum7 endp
randNum6 proc
call randFunc
xor rdx, rdx
mov rcx, 6
div rcx
mov rax, rdx
ret
randNum6 endp
randNum5 proc
call randFunc
xor rdx, rdx
mov rcx, 5
div rcx
mov rax, rdx
ret
randNum5 endp
randNum4 proc
call randFunc
xor rdx, rdx
mov rcx, 4
div rcx
mov rax, rdx
ret
randNum4 endp
randNum3 proc
call randFunc
xor rdx, rdx
mov rcx, 3
div rcx
mov rax, rdx
ret
randNum3 endp
randNum2 proc
call randFunc
xor rdx, rdx
mov rcx, 2
div rcx
mov rax, rdx
ret
randNum2 endp
randFunc proc
rdtsc ; edx:eax counter
xor rax, rcx
ret
randFunc endp
end
__cycle
Entrypoint
Like Tarantino, we’re starting from the end.
This is the function that allows us to randomize the execution flow and control the bytes written into the buffer.
1
2
3
4
5
6
7
8
9
10
11
__cycle:
cmp instructionsNumber, 8
je __ret_end
dec instructionsNumber
sub UserSize, 3
cmp UserSize, 6
jbe __ret_end
add r9, 3
...
Basically what we do at the beginning of each cycle is update the global variables to keep control and avoid overflows or errors.
__movement_dispatcher
(1st byte lable)
This label is responsible for assigning the first byte which defines the data flow (rxx_rxx, rxx_rx, rx_rxx, rx_rx):
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
__cycle:
cmp instructionsNumber, 8
je __ret_end
dec instructionsNumber
sub UserSize, 3
cmp UserSize, 6
jbe __ret_end
add r9, 3
__movement_dispatcher:
xor rax, rax
mov al, srcDest
bt rax, 7
jc __reg_any
__movement_cycle:
call randNum4
mov rcx, rax
mov al, srcDest
bt rax, rcx
jnc __movement_cycle
test usableReg, 111b
jz __only_rx
cmp rcx, 0 ;reg_rxx_rxx 01001000 -> 00000001
jz __rxx_rxx
cmp rcx, 1 ;reg_rxx_rx 01001001 -> 00000010
jz __rxx_rx
test usableReg, 1111000b
jz __movement_cycle
__only_rx:
cmp rcx, 2 ;reg_rx_rxx 01001100 -> 00000100
jz __rx_rxx
cmp rcx, 3 ;reg_rx_rx 01001101 -> 00001000
jz __rx_rx
jmp __movement_cycle
...
This dispatcher jumps to the corresponding labels based on the value passed and, well, luck:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
...
__reg_any:
xor rax, rax
call randNum4
cmp rax, 0
jz __rxx_rxx
cmp rax, 1
jz __rxx_rx
cmp rax, 2
jz __rx_rxx
cmp rax, 3
jz __rx_rx
jmp __instructionDispatcher
__rxx_rxx:
xor rax, rax
mov al, 48h
cld ; clear direction flag so DF = 0, going forward
stosb
jmp __instructionDispatcher
__rxx_rx:
xor rax, rax
mov al, 49h
cld ; clear direction flag so DF = 0, going forward
stosb
jmp __instructionDispatcher
__rx_rxx:
xor rax, rax
mov al, 4ch
cld ; clear direction flag so DF = 0, going forward
stosb
jmp __instructionDispatcher
__rx_rx:
xor rax, rax
mov al, 4dh
cld ; clear direction flag so DF = 0, going forward
stosb
jmp __instructionDispatcher
...
Here we’re writing the first byte into the buffer at rdi
(pTrashBuffer
), which as mentioned defines the data movement between registers.
__instructionDispatcher
(2nd byte lable)
This part uses the same logic as above. We write the second byte depending on the passed-in variables and some randomness:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
__instructionDispatcher:
mov r8b, al
xor rax, rax
mov al, usableIns
bt rax, 7
jc __any_cmd
__instructionCycle:
call randNum4
mov rcx, rax
mov al, usableIns
bt rax, rcx
jnc __instructionCycle
cmp rcx, 4
jz __lea_cmd
cmp rcx, 3
jz __xor_cmd
cmp rcx, 2
jz __or_cmd
cmp rcx, 1
jz __cmp_cmd
cmp rcx, 0
jz __mov_cmd
jmp __error
...
As mentioned before, the program is divided into three phases (except for lea
, which has only two). We write the first byte, proceed, write the second byte, proceed, write the third byte, then reset. The core logic is quite straightforward.
These are the relevant instruction labels:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
; random instruction (every instruction available)
__any_cmd:
xor rax, rax
call randNum5
cmp rax, 0
jz __mov_cmd
cmp rax, 1
jz __cmp_cmd
cmp rax, 2
jz __or_cmd
cmp rax, 3
jz __xor_cmd
cmp rax, 4
jz __lea_cmd
jmp __error
; mov_cmd equ 8bh
__mov_cmd:
mov ecx, eax
xor rax, rax
mov al, 8bh
cld ; clear direction flag so DF = 0, going forward
stosb
jmp __dataDispatcher
; or_cmd equ 0bh
__or_cmd:
mov ecx, eax
xor rax, rax
mov al, 0bh
cld ; clear direction flag so DF = 0, going forward
stosb
jmp __dataDispatcher
; xor_cmd equ 33h
__xor_cmd:
mov ecx, eax
xor rax, rax
mov al, 33h
cld ; clear direction flag so DF = 0, going forward
stosb
jmp __dataDispatcher
; lea_cmd equ 8Dh
__lea_cmd:
cmp r8b, 48h
jne __instructionDispatcher
test usableReg, 111b
jz __instructionDispatcher
mov ecx, eax
xor rax, rax
mov al, 8dh
cld ; clear direction flag so DF = 0, going forward
stosb
__lea_rnd:
call randNum3
mov rcx, rax
mov al, usableReg
bt rax, rcx
jnc __lea_rnd
cmp rcx, 0
je __lea_rax
cmp rcx, 1
je __lea_rcx
cmp rcx, 2
je __lea_rdx
__lea_rax:
xor rax, rax
xor rcx, rcx
call randNum3
test rax, rax
jz __lea_rax
stosb
jmp __cycle
__lea_rcx:
xor rax, rax
xor rcx, rcx
mov cl, 08
call randNum2
test rax, rax
jz _off_lea
or cl, 2
_off_lea:
mov al, cl
stosb
jmp __cycle
__lea_rdx:
xor rax, rax
xor rcx, rcx
mov cl, 10h
call randNum2
or cl, al
mov al, cl
stosb
jmp __cycle
; cmp_cmd equ 3Bh
__cmp_cmd:
xor rax, rax
mov al, 3bh
cld ; clear direction flag so DF = 0, going forward
stosb
jmp __dataDispatcher
...
As you can see, we’re now writing the second byte into the buffer. It’s essentially the same process as before but let’s pause to focus on lea
.
lea
instruction
The thing about lea
is that it doesn’t follow the same byte logic as mov
, cmp
, or
, or xor
. lea
is irregular in how its bytes are structured:
0x48
1
2
3
4
5
6
.text:0000000140001018 lea rax, [rcx] 48 8D 01
.text:000000014000101B lea rax, [rdx] 48 8D 02
.text:000000014000101E lea rcx, [rax] 48 8D 08
.text:0000000140001021 lea rcx, [rdx] 48 8D 0A
.text:0000000140001024 lea rdx, [rax] 48 8D 10
.text:0000000140001027 lea rdx, [rcx] 48 8D 11
0x67
(2 bytes -> 32bits)
1
2
3
4
5
6
.text:0000000140001030 lea eax, [ecx] 67 8D 01
.text:0000000140001033 lea eax, [edx] 67 8D 02
.text:0000000140001036 lea ecx, [eax] 67 8D 08
.text:0000000140001039 lea ecx, [edx] 67 8D 0A
.text:000000014000103C lea edx, [eax] 67 8D 10
.text:000000014000103F lea edx, [ecx] 67 8D 11
As shown, it follows a completely different logic, which is why lea
only involves two steps, whereas other instructions take three, so there is an specific dispatch function for it.
__dataDispatcher
(3rd byte lable)
Same logic as the previous stages:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
__dataDispatcher:
; mov_cmd equ 00000001b ; 8bh
; cmp_cmd equ 00000010b ; 3Bh
; or_cmd equ 00000100b ; 0bh
; xor_cmd equ 00001000b ; 33h
; lea_cmd equ 00010000b ; 8Dh
; EXCEPTIONAL CASES:
; lea
call randNum7
mov rcx, rax
mov al, usableReg
bt rax, rcx
jnc __dataDispatcher
cmp rcx, 0
jz __rax_dst
cmp rcx, 1
jz __rcx_dst
cmp rcx, 2
jz __rdx_dst
cmp rcx, 3
jz __r8_dst
cmp rcx, 4
jz __r9_dst
cmp rcx, 5
jz __r10_dst
cmp rcx, 6
jz __r11_dst
...
Now we move on to the labels responsible for writing the third and final byte, which corresponds to the destination register:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
...
__rax_dst:
xor rax, rax
xor rcx, rcx
call randNum3
mov rcx, rax
mov al, 0c0h
or al, cl
stosb
jmp __cycle
__rcx_dst:
xor rax, rax
xor rcx, rcx
call randNum3
mov rcx, rax
mov al, 0c8h
or al, cl
stosb
jmp __cycle
__rdx_dst:
xor rax, rax
xor rcx, rcx
call randNum3
mov rcx, rax
mov al, 0d0h
or al, cl
stosb
jmp __cycle
__r8_dst:
xor rax, rax
xor rcx, rcx
call randNum4
mov rcx, rax
mov al, 0c0h
or al, cl
stosb
jmp __cycle
__r9_dst:
xor rax, rax
xor rcx, rcx
call randNum4
mov rcx, rax
mov al, 0c0h
or al, cl
stosb
jmp __cycle
__r10_dst:
xor rax, rax
xor rcx, rcx
call randNum3
mov rcx, rax
mov al, 0c0h
or al, cl
stosb
jmp __cycle
__r11_dst:
xor rax, rax
xor rcx, rcx
mov al, 0dbh
stosb
jmp __cycle
...
As you can see, each label jumps back to __cycle
at the end, starting the process again.
POC
Lets use TrashFormer with different parameters in each execution to see the results and how it generates the code.
1
2
3
...
unsigned int result = TrashFormer(pTrashBuffer, 300, (((reg_any) << 24) | ((any_cmd) << 16) | ((reg_any_any) << 8) | (0xFF)));
...
Here’s the buffer:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
0x49, 0x33, 0xDB, 0x48, 0x0B, 0xC2, 0x4C, 0x0B,
0xC2, 0x48, 0x8B, 0xC1, 0x4D, 0x0B, 0xC0, 0x48,
0x3B, 0xC1, 0x49, 0x8B, 0xC8, 0x48, 0x3B, 0xC2,
0x48, 0x3B, 0xC1, 0x48, 0x33, 0xC1, 0x49, 0x8B,
0xC0, 0x4C, 0x33, 0xCA, 0x49, 0x0B, 0xCA, 0x4D,
0x8B, 0xDB, 0x4D, 0x0B, 0xDB, 0x4D, 0x33, 0xC3,
0x48, 0x33, 0xC2, 0x48, 0x33, 0xC2, 0x4D, 0x33,
0xDB, 0x48, 0x0B, 0xC0, 0x4C, 0x3B, 0xCA, 0x48,
0x33, 0xC3, 0x4D, 0x33, 0xC2, 0x4D, 0x0B, 0xDB,
0x48, 0x8B, 0xCA, 0x48, 0x0B, 0xD0, 0x48, 0x8B,
0xC1, 0x4D, 0x0B, 0xC0, 0x48, 0x8B, 0xDB, 0x4D,
0x33, 0xC3, 0x4C, 0x3B, 0xC1, 0x49, 0x0B, 0xD0,
0x48, 0x8B, 0xCA, 0x49, 0x0B, 0xDB, 0x4C, 0x8B,
0xC2, 0x4D, 0x0B, 0xC1, 0x4C, 0x33, 0xC3, 0x4C,
0x0B, 0xC0, 0x48, 0x8B, 0xC0, 0x48, 0x8B, 0xC0,
0x4C, 0x3B, 0xC1, 0x49, 0x8B, 0xD1, 0x49, 0x0B,
0xC2, 0x48, 0x8B, 0xD0, 0x48, 0x8B, 0xC1, 0x49,
0x3B, 0xC8, 0x4C, 0x8B, 0xC0, 0x48, 0x3B, 0xC0,
0x48, 0x33, 0xD2, 0x4C, 0x3B, 0xD0, 0x4C, 0x3B,
0xD2, 0x49, 0x33, 0xC1, 0x4C, 0x8B, 0xD0, 0x48,
0x8D, 0x02, 0x49, 0x8B, 0xD0, 0x4D, 0x8B, 0xC2,
0x4C, 0x3B, 0xDB, 0x4C, 0x8B, 0xC2, 0x4C, 0x0B,
0xC3, 0x48, 0x0B, 0xD1, 0x48, 0x33, 0xC2, 0x4D,
0x0B, 0xC0, 0x49, 0x0B, 0xC1, 0x4C, 0x3B, 0xD1,
0x48, 0x8B, 0xC8, 0x48, 0x8D, 0x02, 0x4C, 0x33,
0xC1, 0x4D, 0x0B, 0xC1, 0x48, 0x8D, 0x02, 0x49,
0x3B, 0xDB, 0x4D, 0x3B, 0xC0, 0x48, 0x0B, 0xDB,
0x4C, 0x8B, 0xC1, 0x4C, 0x33, 0xD1, 0x4D, 0x8B,
0xC3, 0x49, 0x0B, 0xD0, 0x48, 0x8B, 0xCA, 0x4D,
0x8B, 0xC9, 0x4C, 0x33, 0xC2, 0x49, 0x0B, 0xC8,
0x49, 0x8B, 0xC8, 0x49, 0x8B, 0xC1, 0x49, 0x8B,
0xC1, 0x4D, 0x0B, 0xDB, 0x48, 0x0B, 0xC0, 0x4C,
0x0B, 0xD0, 0x49, 0x8B, 0xC1, 0x4C, 0x3B, 0xDB,
0x49, 0x8B, 0xDB, 0x4D, 0x8B, 0xDB, 0x49, 0x3B,
0xC9, 0x48, 0x0B, 0xCA, 0xC3, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00
Here’s the generated code:
1
2
3
4
5
6
7
4C 3B D2 cmp r10,rdx
49 33 C1 xor rax,r9
4C 8B D0 mov r10,rax
48 8D 02 lea rax,[rdx]
49 8B D0 mov rdx,r8
4D 8B C2 mov r8,r10
4C 3B DB cmp r11,rbx
Second exec:
1
2
3
...
unsigned int result = TrashFormer(pTrashBuffer, 300, (/*usable registers*/ ((reg_r8) << 24) | /*instruction used*/((mov_cmd) << 16) | /*Instruction movement*/ ((reg_rx_rx) << 8) | /*Number of instructions*/ (0x10)));
...
Hex:
1
2
3
4
5
0x4D, 0x8B, 0xC3, 0x4D, 0x8B, 0xC2, 0x4D, 0x8B,
0xC3, 0x4D, 0x8B, 0xC0, 0x4D, 0x8B, 0xC2, 0x4D,
0x8B, 0xC0, 0x4D, 0x8B, 0xC0, 0x4D, 0x8B, 0xC3,
0xC3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
...
Code:
1
2
3
4
5
6
7
8
9
4D 8B C3 mov r8,r11
4D 8B C2 mov r8,r10
4D 8B C3 mov r8,r11
4D 8B C0 mov r8,r8
4D 8B C2 mov r8,r10
4D 8B C0 mov r8,r8
4D 8B C0 mov r8,r8
4D 8B C3 mov r8,r11
C3 ret
Third and final execution:
1
2
3
...
unsigned int result = TrashFormer(pTrashBuffer, 300, (/*usable registers*/ ((reg_rax) << 24) | /*instruction used*/(((any_cmd) << 16) | /*Instruction movement*/ (reg_rxx_rxx | reg_rx_rxx) << 8) | /*Number of instructions*/ (0xff)));
...
Hex:
1
2
3
4
5
6
7
8
9
10
0x48, 0x8D, 0x02, 0x48, 0x0B, 0xC0, 0x48, 0x0B,
0xC0, 0x48, 0x8D, 0x01, 0x48, 0x8D, 0x02, 0x48,
0x3B, 0xC2, 0x48, 0x3B, 0xC0, 0x48, 0x8B, 0xC2,
0x48, 0x0B, 0xC1, 0x48, 0x8B, 0xC0, 0x48, 0x8D,
0x01, 0x48, 0x8D, 0x01, 0x48, 0x33, 0xC1, 0x48,
0x8B, 0xC2, 0x48, 0x0B, 0xC2, 0x48, 0x8B, 0xC1,
0x48, 0x8D, 0x01, 0x48, 0x8D, 0x01, 0x48, 0x8D,
0x01, 0x48, 0x0B, 0xC0, 0x48, 0x3B, 0xC0, 0x48,
0x8B, 0xC2, 0x48, 0x8D, 0x01, 0x48, 0x3B, 0xC0,
...
Code:
1
2
3
4
5
6
7
48 0B C1 or rax,rcx
48 8B C0 mov rax,rax
48 8D 01 lea rax,[rcx]
48 8D 01 lea rax,[rcx]
48 33 C1 xor rax,rcx
48 8B C2 mov rax,rdx
48 0B C2 or rax,rdx
Conclusion
You can check the full source code on my github repo TrashFormer
Good morning, and in case I don’t see ya: Good afternoon, good evening, and good night!