Copy-and-Patch: 튜토리얼

Copy-and-patch Compilation은 베이스라인 JIT[1]를 구성하는 매우 흥미로운 방법이다. 이 방법은 유지보수가 아주 쉬운 방식으로 코드 조각을 런타임에 믿을 수 없을 만큼 빠르게 컴파일할 수 있게 해주고, 어셈블리 코드를 실제로 이해할 필요가 거의 없으며, 전통적인 수작업 베이스라인 JIT와 비슷한 범위에 들어갈 만큼 충분히 좋은 품질의 네이티브 코드를 만들어낸다.

[1]: 여기서 베이스라인 JIT란, 자체적으로 잘 최적화된 코드를 생성하기보다는 주로 코드를 빠르게 생성하고 인터프리팅 오버헤드를 제거함으로써 성능을 얻는 것을 목표로 하는 JIT를 의미한다. 베이스라인 JIT는 최적화 JIT와 짝을 이룰 수 있는데, 예를 들어 WASM을 위한 V8의 Liftoff 베이스라인 JIT는 V8의 TurboFan 최적화 JIT로 티어 업(tiering up)할 수 있다.

Copy-and-patch는 _스텐실_을 작성하는 방식으로 동작한다. 스텐실은 원하는 개별 연산을 구현하는 최소한의 C 함수로, 컴파일된 네이티브 코드 조각들을 서로 이어 붙일 수 있도록 작성된다. JIT 컴파일 시점에는 각 연산에 대해 미리 컴파일된 조각을 연속해서(back-to-back) 복사하고, 필요에 따라 내부에 박혀 있는 상수나 주소를 바꿔 끼우기(patch)만 하면 된다.

Copy-and-patch가 어떻게 동작하는지 이해하기 위한 모험으로, 우리의 목표는 다음 함수를 만드는 것이다.

// Reference version of the function the tutorial sets out to generate at
// runtime: returns the sum of its two arguments.
int add_a_b(int a, int b) {
    return a + b;
}

하지만 이를 런타임에 1 + 2를 계산하도록 특화(specialize)할 것이다. 이를 위해 먼저 바이트코드 크기 정도의 연산들로 쪼개 보자:

load_int_reg1: a = 1;
load_int_reg2: b = 2;
add_int1_int2: c = a + b;
return_int1: return c;

그리고 copy-and-patch JIT를 정의하기 위해, 각 항목에 대해 다음을 수행한다:

나중에 패치할 재배치(relocation) 구멍을 포함하도록, 연산을 C로 구현해 스텐실을 만든다.
스텐실을 네이티브 코드로 컴파일한다.
네이티브 코드를 C 파일로 복사-붙여넣기 하고, 버퍼에 이를 방출(emit)하며 재배치를 패치하는 함수들을 작성한다.

그다음 간단한 JIT 컴파일 엔진을 작성해 스텐실을 이어 붙이고 생성된 함수를 실행하면 된다. 시작해 보자!

스텐실

첫 단계는 스텐실을 정의하는 것이다:

stencils.c

#include <stdint.h>

// Every stencil (and the function-pointer type used to chain stencils) is
// declared with the preserve_none calling convention. In the compiled listing
// of these stencils the int arguments travel in r12d and r13d.
#define STENCIL_FUNCTION __attribute__((preserve_none))

// Placeholder symbols that are only declared here. Referencing them makes the
// compiler emit relocations (R_X86_64_32 for the value, R_X86_64_PLT32 for the
// function in the listing) that mark the bytes to patch at JIT time.
// NOTE(review): the rationale for the 65536-byte array size is not stated here.
extern char cnp_value_hole[65536];
extern void cnp_func_hole(void) STENCIL_FUNCTION;

// Expands to the address of cnp_value_hole converted to `type`, i.e. a
// constant whose real value is filled in later.
#define STENCIL_HOLE(type) \
  (type)((uintptr_t)&cnp_value_hole)
// Declares a local function pointer `stencil_output`, taking the given
// parameter types, that points at cnp_func_hole. Calling it is how a stencil
// hands its results to whatever code follows it.
#define DECLARE_STENCIL_OUTPUT(...) \
  typedef void(*stencil_output_fn)(__VA_ARGS__) STENCIL_FUNCTION; \
  stencil_output_fn stencil_output = (stencil_output_fn)&cnp_func_hole;

// Stencil: produce a constant as the first value.
// `a` is taken from the value hole and handed to the output hole; in the
// compiled listing this becomes `mov r12d, <hole>` followed by a `jmp`.
STENCIL_FUNCTION void load_int_reg1() {
  int a = STENCIL_HOLE(int);
  DECLARE_STENCIL_OUTPUT(int);
  stencil_output(a);
}

// Stencil: produce a constant as the second value.
// The incoming `a` is forwarded unchanged and `b` is taken from the value
// hole; in the compiled listing only `mov r13d, <hole>` plus a `jmp` remain.
STENCIL_FUNCTION void load_int_reg2(int a) {
  int b = STENCIL_HOLE(int);
  DECLARE_STENCIL_OUTPUT(int, int);
  stencil_output(a, b);
}

// Stencil: add the two incoming values and pass the sum on as the single
// output value. It has no value hole; in the compiled listing it is
// `add r12d, r13d` followed by a `jmp`.
STENCIL_FUNCTION void add_int1_int2(int a, int b) {
  int c = a + b;
  DECLARE_STENCIL_OUTPUT(int);
  stencil_output(c);
}

// Stencil: end the chain by returning the incoming value. Unlike the other
// stencils it declares no output hole; the compiled listing is
// `mov eax, r12d` followed by `ret`.
STENCIL_FUNCTION int return_int1(int a) {
  return a;
}

이를 clang -O3 -fno-pic -mcmodel=medium -c stencils.c로 컴파일하고, objdump -d -Mintel,x86-64 --disassemble --reloc stencils.o로 생성된 코드를 살펴보자. 결과는 다음과 같다:

0000000000000000 <load_int_reg1>:
   0:	41 bc 00 00 00 00    	mov    r12d,0x0
			2: R_X86_64_32	cnp_value_hole
   6:	e9 00 00 00 00       	jmp    b <load_int_reg1+0xb>
			7: R_X86_64_PLT32	cnp_func_hole-0x4
   b:	0f 1f 44 00 00       	nop    DWORD PTR [rax+rax*1+0x0]

0000000000000010 <load_int_reg2>:
  10:	41 bd 00 00 00 00    	mov    r13d,0x0
			12: R_X86_64_32	cnp_value_hole
  16:	e9 00 00 00 00       	jmp    1b <load_int_reg2+0xb>
			17: R_X86_64_PLT32	cnp_func_hole-0x4
  1b:	0f 1f 44 00 00       	nop    DWORD PTR [rax+rax*1+0x0]

0000000000000020 <add_int1_int2>:
  20:	45 01 ec             	add    r12d,r13d
  23:	e9 00 00 00 00       	jmp    28 <add_int1_int2+0x8>
			24: R_X86_64_PLT32	cnp_func_hole-0x4
  28:	0f 1f 84 00 00 00 00 	nop    DWORD PTR [rax+rax*1+0x0]
  2f:	00

0000000000000030 <return_int1>:
  30:	44 89 e0             	mov    eax,r12d
  33:	c3                   	ret

(NOP들은 실제로 함수의 일부가 아니라, 각 함수가 16바이트 정렬로 시작하도록 추가된 패딩이다.)

각 스텐실에 대해, JIT 시에 사용할 스텐실 생성 라이브러리를 만들기 위해 아래 템플릿을 채운다.

uint8_t cnp_stencil_<OP>_code[] = {
  // 함수의 시작부터 마지막 jmp 직전까지의 바이트를 복사한다(jmp 자체는 제외한다).
};

uint8_t* cnp_copy_<OP>(uint8_t* stencil_start) {
  const size_t stencil_size = sizeof(cnp_stencil_<OP>_code);
  memcpy(stencil_start, cnp_stencil_<OP>_code, stencil_size);
  return stencil_start + stencil_size;
}

// 스텐실에 재배치가 있다면 값을 채운다.
// 없다면 이 함수를 작성하지 않고 건너뛰면 된다.
void cnp_patch_<OP>(uint8_t* stencil_start, /* ... */ ) {
  memcpy(stencil_start + /*relocation_offset*/, &value, /* relocation_size */);
}

그럼 시작해 보자!

cnp_stencils.c

#include <stddef.h>  // size_t
#include <stdint.h>
#include <string.h>  // memcpy

// Machine code of the load_int_reg1 stencil, from the start of the function
// up to (not including) its trailing jmp. The four zero bytes are the
// placeholder immediate that cnp_patch_load_int_reg1() overwrites.
uint8_t cnp_stencil_load_int_reg1_code[] = {
   0x41, 0xbc, 0x00, 0x00, 0x00, 0x00, // mov r12d,0x0
};

// Copies the load_int_reg1 stencil to stencil_start.
// The caller must provide at least sizeof(cnp_stencil_load_int_reg1_code)
// writable bytes. Returns the address one past the copied bytes, which is
// where the next stencil should be placed.
uint8_t* cnp_copy_load_int_reg1(uint8_t* stencil_start) {
  const size_t stencil_size = sizeof(cnp_stencil_load_int_reg1_code);
  memcpy(stencil_start, cnp_stencil_load_int_reg1_code, stencil_size);
  return stencil_start + stencil_size;
}

// Writes `value` over the placeholder immediate of a load_int_reg1 stencil
// previously copied to stencil_start.
void cnp_patch_load_int_reg1(uint8_t* stencil_start, int value) {
  // 2: R_X86_64_32 cnp_value_hole  ->  0x02 offset
  memcpy(stencil_start + 0x2, &value, sizeof(value));
}

// Bytes of the load_int_reg2 stencil without its trailing jmp; the zeroed
// immediate is filled in by cnp_patch_load_int_reg2().
uint8_t cnp_stencil_load_int_reg2_code[] = {
  0x41, 0xbd,             // mov r13d, imm32
  0x00, 0x00, 0x00, 0x00, //   imm32 placeholder
};

// Emits the load_int_reg2 stencil at stencil_start and returns the first
// byte after it.
uint8_t* cnp_copy_load_int_reg2(uint8_t* stencil_start) {
  uint8_t* const next = stencil_start + sizeof cnp_stencil_load_int_reg2_code;
  memcpy(stencil_start, cnp_stencil_load_int_reg2_code,
         sizeof cnp_stencil_load_int_reg2_code);
  return next;
}

// Stores `value` into the immediate of an already-emitted load_int_reg2
// stencil.
void cnp_patch_load_int_reg2(uint8_t* stencil_start, int value) {
  // Relocation at 0x12 in the listing, function base 0x10 -> offset 2.
  uint8_t* const hole = &stencil_start[2];
  memcpy(hole, &value, sizeof value);
}

// Bytes of the add_int1_int2 stencil without its trailing jmp.
uint8_t cnp_stencil_add_int1_int2_code[] = {
  0x45, 0x01, 0xec, // add r12d,r13d
};

// Emits the add_int1_int2 stencil at stencil_start and returns the first
// byte after it.
uint8_t* cnp_copy_add_int1_int2(uint8_t* stencil_start) {
  uint8_t* const next = stencil_start + sizeof cnp_stencil_add_int1_int2_code;
  memcpy(stencil_start, cnp_stencil_add_int1_int2_code,
         sizeof cnp_stencil_add_int1_int2_code);
  return next;
}
// This stencil contains no value hole, so there is no patch function.

// Bytes of the return_int1 stencil; it ends in ret, so nothing is cut off.
uint8_t cnp_stencil_return_int1_code[] = {
  0x44, 0x89, 0xe0, // mov eax,r12d
  0xc3,             // ret
};

// Emits the return_int1 stencil at stencil_start and returns the first byte
// after it.
uint8_t* cnp_copy_return_int1(uint8_t* stencil_start) {
  uint8_t* const next = stencil_start + sizeof cnp_stencil_return_int1_code;
  memcpy(stencil_start, cnp_stencil_return_int1_code,
         sizeof cnp_stencil_return_int1_code);
  return next;
}
// This stencil contains no value hole, so there is no patch function.

완전히 자동화된 설정에서는 이 모든 작업이 빌드 시스템의 일부로 수행된다. 스텐실 컴파일과 이를 copy 함수/patch 함수 라이브러리로 변환하는 과정은 make 실행의 일부로 진행된다.

첫 JIT 만들기

이제 스텐실 라이브러리가 준비됐으니, 코드 생성 함수들을 사용해 런타임에 특화된 덧셈기를 만들 수 있다:

cnp_jit.c

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

//#include "cnp_stencils.h"
uint8_t* cnp_copy_load_int_reg1(uint8_t* stencil_start);
void cnp_patch_load_int_reg1(uint8_t* stencil_start, int value);
uint8_t* cnp_copy_load_int_reg2(uint8_t* stencil_start);
void cnp_patch_load_int_reg2(uint8_t* stencil_start, int value);
uint8_t* cnp_copy_add_int1_int2(uint8_t* stencil_start);
uint8_t* cnp_copy_return_int1(uint8_t* stencil_start);

typedef int(*jit_func)() __attribute__((preserve_none));

jit_func create_add_1_2() {
  // Most systems mark memory as non-executable by default
  // and mprotect() to set memory as executable needs
  // to be run against mmap-allocated memory.  We start
  // by allocating it as read/write, and then switch it
  // to write/execute once we're done writing the code.
  uint8_t* codedata = mmap(NULL, 256, PROT_READ | PROT_WRITE,
      MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
  assert (codedata != MAP_FAILED);
  jit_func ret = (jit_func)codedata;

  // Concatenate our program together, while saving the
  // locations that need to be patched.
  uint8_t* load_int_reg1_location = codedata;
  codedata = cnp_copy_load_int_reg1(codedata);
  uint8_t* load_int_reg2_location = codedata;
  codedata = cnp_copy_load_int_reg2(codedata);
  codedata = cnp_copy_add_int1_int2(codedata);
  codedata = cnp_copy_return_int1(codedata);

  // Overwrite the zero value placeholders with our intended
  // specialized values: 1 and 2.
  cnp_patch_load_int_reg1(load_int_reg1_location, 1);
  cnp_patch_load_int_reg2(load_int_reg2_location, 2);

  // Now that we're done writing, remove write access and
  // allow execution from this page instead.
  int rc = mprotect(ret, 256, PROT_READ | PROT_EXEC);
  if (rc) {
    perror("mprotect");
  }
  return ret;
}

int main() {
  jit_func add_1_2 = create_add_1_2();
  int result = add_1_2();
  printf("JIT'd 1 + 2 = %d\n", result);
  return 0;
}

이제 이를 컴파일하고 실행할 수 있다!

$ clang cnp_jit.c cnp_stencils.c -o cnp_jit
$ ./cnp_jit
JIT'd 1 + 2 = 3

clang이 실제 어셈블리 코드를 작성하는 어려운 일을 대신 해주고, 우리의 JIT 컴파일러는 단지 memcpy 호출들의 묶음일 뿐인데도, 런타임 코드 생성을 성공적으로 구축했다!

이 모든 마법이 어떻게/왜 맞물려 동작하는지에 대한 더 깊은 설명은 How It Works에서 계속된다.

함께 보기

A Worked Example of Copy-and-Patch Compilation

직접 해보기

재배치 구멍을 선언하기 쉽게 해주는 몇 가지 매크로를 제공하는 헤더는 다음과 같다:

cnp_stencils.h

#include <stdint.h>

// Applied to every stencil and to the function-pointer types used to leave
// one: selects the preserve_none calling convention.
#define STENCIL_FUNCTION __attribute__((preserve_none))

// Declared only (no definition in this header); DECLARE_STENCIL_OUTPUT points
// its function pointer at this symbol.
extern void cnp_stencil_output(void) STENCIL_FUNCTION;

// Each macro below expands to the address of a declared-only symbol whose
// name ends in `ordinal`, converted to `type`. The symbols themselves are
// declared by DECLARE_EXTERN_HOLES(ordinal).

// Value hole backed by cnp_small_value_hole_<ordinal>; appears as an
// R_X86_64_32 relocation in the example listing.
#define STENCIL_HOLE32(ordinal, type) \
  (type)((uintptr_t)&cnp_small_value_hole_##ordinal)
// Value hole backed by cnp_large_value_hole_<ordinal>; appears as an
// R_X86_64_64 relocation in the example listing.
#define STENCIL_HOLE64(ordinal, type) \
  (type)((uintptr_t)&cnp_large_value_hole_##ordinal)
// Function hole backed by cnp_near_func_hole_<ordinal>; the example listing
// calls it directly through an R_X86_64_PLT32 relocation.
#define STENCIL_FN_NEAR(ordinal, type) \
  (type)&cnp_near_func_hole_##ordinal
// Function hole backed by cnp_far_func_hole_<ordinal>. The address is passed
// through an empty asm statement with a "+r" constraint before being cast;
// in the example listing the result is a movabs (R_X86_64_64) followed by an
// indirect `call rax`.
#define STENCIL_FN_FAR(ordinal, type) \
  ({ uint64_t _cnp_addr_as_int = (uint64_t)((uintptr_t)&cnp_far_func_hole_##ordinal); \
  asm volatile("" : "+r" (_cnp_addr_as_int) : : "memory"); \
  (type)_cnp_addr_as_int; })
// Declares a local function pointer `stencil_output`, taking the given
// parameter types, that points at cnp_stencil_output.
#define DECLARE_STENCIL_OUTPUT(...) \
  typedef void(*stencil_output_fn)(__VA_ARGS__) STENCIL_FUNCTION; \
  stencil_output_fn stencil_output = (stencil_output_fn)&cnp_stencil_output;

// Declares the four hole symbols for one ordinal.
// NOTE(review): the reason for the 100000 / 8 array sizes is not stated in
// this header.
#define DECLARE_EXTERN_HOLES(ordinal) \
extern char cnp_large_value_hole_##ordinal[100000]; \
extern char cnp_small_value_hole_##ordinal[8]; \
extern void cnp_near_func_hole_##ordinal(void) STENCIL_FUNCTION; \
extern char cnp_far_func_hole_##ordinal[100000];

(이 매크로들이 왜 이런 형태인지에 대한 자세한 내용이 궁금하다면, 시리즈의 다음 글을 참고하라!)

그다음 필요한 만큼 복잡한 스텐실을 선언할 수 있다:

complex_stencil.h

#include "cnp_stencils.h"

// Declare up to the maximum number of holes you need of one type
// in a function:
DECLARE_EXTERN_HOLES(1);
DECLARE_EXTERN_HOLES(2);

// Example stencil that uses every hole kind: two 32-bit value holes, one
// 64-bit value hole, a near function hole and a far function hole.
STENCIL_FUNCTION
void fused_multiply_add_sqrt_ifnotzero() {
  uint32_t a = STENCIL_HOLE32(1, uint32_t);
  // NOTE(review): the hole is cast to int32_t but stored in a uint32_t;
  // confirm which type is intended.
  uint32_t b = STENCIL_HOLE32(2, int32_t);
  uint64_t c = STENCIL_HOLE64(1, uint64_t);

  // a * b is evaluated in 32 bits (both operands are uint32_t) and only then
  // widened and added to c.
  uint64_t fma = a * b + c;

  if (fma == 0) {
    void (*div_trap)(void) = STENCIL_FN_NEAR(1, void(*)(void));
    div_trap();
  }

  uint64_t (*sqrt)(uint64_t) = STENCIL_FN_FAR(1, uint64_t(*)(uint64_t));
  // NOTE(review): sqrt receives c, not fma, so fma only feeds the zero check
  // above; confirm this is intended.
  uint64_t result = sqrt(c);

  DECLARE_STENCIL_OUTPUT(uint64_t);
  stencil_output(result);
}

그리고 완전성을 위해 덧붙이면, 이는 다음과 같이 컴파일된다:

0000000000000000 <fused_multiply_add_sqrt_ifnotzero>:
   0:	50                   	push   rax
   1:	b8 00 00 00 00       	mov    eax,0x0
			2: R_X86_64_32	cnp_small_value_hole_2
   6:	b9 00 00 00 00       	mov    ecx,0x0
			7: R_X86_64_32	cnp_small_value_hole_1
   b:	0f af c8             	imul   ecx,eax
   e:	48 b8 00 00 00 00 00 	movabs rax,0x0
  15:	00 00 00
			10: R_X86_64_64	cnp_large_value_hole_1
  18:	48 01 c8             	add    rax,rcx
  1b:	75 05                	jne    22 <fused_multiply_add_sqrt_ifnotzero+0x22>
  1d:	e8 00 00 00 00       	call   22 <fused_multiply_add_sqrt_ifnotzero+0x22>
			1e: R_X86_64_PLT32	cnp_near_func_hole_1-0x4
  22:	48 b8 00 00 00 00 00 	movabs rax,0x0
  29:	00 00 00
			24: R_X86_64_64	cnp_far_func_hole_1
  2c:	48 bf 00 00 00 00 00 	movabs rdi,0x0
  33:	00 00 00
			2e: R_X86_64_64	cnp_large_value_hole_1
  36:	ff d0                	call   rax
  38:	49 89 c4             	mov    r12,rax
  3b:	58                   	pop    rax
  3c:	e9 00 00 00 00       	jmp    41 <fused_multiply_add_sqrt_ifnotzero+0x41>
			3d: R_X86_64_PLT32	cnp_stencil_output-0x4

Copy-and-patch가 어떻게 동작하는지 이해하기 위한 모험으로, 우리의 목표는 다음 함수를 만드는 것이다.

// Reference version of the function the tutorial sets out to generate at
// runtime: returns the sum of its two arguments.
int add_a_b(int a, int b) {
    return a + b;
}

하지만 이를 런타임에 1 + 2를 계산하도록 특화(specialize)할 것이다. 이를 위해 먼저 바이트코드 크기 정도의 연산들로 쪼개 보자:

load_int_reg1: a = 1;
load_int_reg2: b = 2;
add_int1_int2: c = a + b;
return_int1: return c;

그리고 copy-and-patch JIT를 정의하기 위해, 각 항목에 대해 다음을 수행한다:

나중에 패치할 재배치(relocation) 구멍을 포함하도록, 연산을 C로 구현해 스텐실을 만든다.
스텐실을 네이티브 코드로 컴파일한다.
네이티브 코드를 C 파일로 복사-붙여넣기 하고, 버퍼에 이를 방출(emit)하며 재배치를 패치하는 함수들을 작성한다.

그다음 간단한 JIT 컴파일 엔진을 작성해 스텐실을 이어 붙이고 생성된 함수를 실행하면 된다. 시작해 보자!

스텐실

첫 단계는 스텐실을 정의하는 것이다:

stencils.c

#include <stdint.h>

// Every stencil (and the function-pointer type used to chain stencils) is
// declared with the preserve_none calling convention. In the compiled listing
// of these stencils the int arguments travel in r12d and r13d.
#define STENCIL_FUNCTION __attribute__((preserve_none))

// Placeholder symbols that are only declared here. Referencing them makes the
// compiler emit relocations (R_X86_64_32 for the value, R_X86_64_PLT32 for the
// function in the listing) that mark the bytes to patch at JIT time.
// NOTE(review): the rationale for the 65536-byte array size is not stated here.
extern char cnp_value_hole[65536];
extern void cnp_func_hole(void) STENCIL_FUNCTION;

// Expands to the address of cnp_value_hole converted to `type`, i.e. a
// constant whose real value is filled in later.
#define STENCIL_HOLE(type) \
  (type)((uintptr_t)&cnp_value_hole)
// Declares a local function pointer `stencil_output`, taking the given
// parameter types, that points at cnp_func_hole. Calling it is how a stencil
// hands its results to whatever code follows it.
#define DECLARE_STENCIL_OUTPUT(...) \
  typedef void(*stencil_output_fn)(__VA_ARGS__) STENCIL_FUNCTION; \
  stencil_output_fn stencil_output = (stencil_output_fn)&cnp_func_hole;

// Stencil: produce a constant as the first value.
// `a` is taken from the value hole and handed to the output hole; in the
// compiled listing this becomes `mov r12d, <hole>` followed by a `jmp`.
STENCIL_FUNCTION void load_int_reg1() {
  int a = STENCIL_HOLE(int);
  DECLARE_STENCIL_OUTPUT(int);
  stencil_output(a);
}

// Stencil: produce a constant as the second value.
// The incoming `a` is forwarded unchanged and `b` is taken from the value
// hole; in the compiled listing only `mov r13d, <hole>` plus a `jmp` remain.
STENCIL_FUNCTION void load_int_reg2(int a) {
  int b = STENCIL_HOLE(int);
  DECLARE_STENCIL_OUTPUT(int, int);
  stencil_output(a, b);
}

// Stencil: add the two incoming values and pass the sum on as the single
// output value. It has no value hole; in the compiled listing it is
// `add r12d, r13d` followed by a `jmp`.
STENCIL_FUNCTION void add_int1_int2(int a, int b) {
  int c = a + b;
  DECLARE_STENCIL_OUTPUT(int);
  stencil_output(c);
}

// Stencil: end the chain by returning the incoming value. Unlike the other
// stencils it declares no output hole; the compiled listing is
// `mov eax, r12d` followed by `ret`.
STENCIL_FUNCTION int return_int1(int a) {
  return a;
}

0000000000000000 <load_int_reg1>:
   0:	41 bc 00 00 00 00    	mov    r12d,0x0
			2: R_X86_64_32	cnp_value_hole
   6:	e9 00 00 00 00       	jmp    b <load_int_reg1+0xb>
			7: R_X86_64_PLT32	cnp_func_hole-0x4
   b:	0f 1f 44 00 00       	nop    DWORD PTR [rax+rax*1+0x0]

0000000000000010 <load_int_reg2>:
  10:	41 bd 00 00 00 00    	mov    r13d,0x0
			12: R_X86_64_32	cnp_value_hole
  16:	e9 00 00 00 00       	jmp    1b <load_int_reg2+0xb>
			17: R_X86_64_PLT32	cnp_func_hole-0x4
  1b:	0f 1f 44 00 00       	nop    DWORD PTR [rax+rax*1+0x0]

0000000000000020 <add_int1_int2>:
  20:	45 01 ec             	add    r12d,r13d
  23:	e9 00 00 00 00       	jmp    28 <add_int1_int2+0x8>
			24: R_X86_64_PLT32	cnp_func_hole-0x4
  28:	0f 1f 84 00 00 00 00 	nop    DWORD PTR [rax+rax*1+0x0]
  2f:	00

0000000000000030 <return_int1>:
  30:	44 89 e0             	mov    eax,r12d
  33:	c3                   	ret

(NOP들은 실제로 함수의 일부가 아니라, 각 함수가 16바이트 정렬로 시작하도록 추가된 패딩이다.)

각 스텐실에 대해, JIT 시에 사용할 스텐실 생성 라이브러리를 만들기 위해 아래 템플릿을 채운다.

uint8_t cnp_stencil_<OP>_code[] = {
  // 함수의 시작부터 마지막 jmp 직전까지의 바이트를 복사한다(jmp 자체는 제외한다).
};

uint8_t* cnp_copy_<OP>(uint8_t* stencil_start) {
  const size_t stencil_size = sizeof(cnp_stencil_<OP>_code);
  memcpy(stencil_start, cnp_stencil_<OP>_code, stencil_size);
  return stencil_start + stencil_size;
}

// 스텐실에 재배치가 있다면 값을 채운다.
// 없다면 이 함수를 작성하지 않고 건너뛰면 된다.
void cnp_patch_<OP>(uint8_t* stencil_start, /* ... */ ) {
  memcpy(stencil_start + /*relocation_offset*/, &value, /* relocation_size */);
}

그럼 시작해 보자!

cnp_stencils.c

#include <stddef.h>  // size_t
#include <stdint.h>
#include <string.h>  // memcpy

// Machine code of the load_int_reg1 stencil, from the start of the function
// up to (not including) its trailing jmp. The four zero bytes are the
// placeholder immediate that cnp_patch_load_int_reg1() overwrites.
uint8_t cnp_stencil_load_int_reg1_code[] = {
   0x41, 0xbc, 0x00, 0x00, 0x00, 0x00, // mov r12d,0x0
};

// Copies the load_int_reg1 stencil to stencil_start.
// The caller must provide at least sizeof(cnp_stencil_load_int_reg1_code)
// writable bytes. Returns the address one past the copied bytes, which is
// where the next stencil should be placed.
uint8_t* cnp_copy_load_int_reg1(uint8_t* stencil_start) {
  const size_t stencil_size = sizeof(cnp_stencil_load_int_reg1_code);
  memcpy(stencil_start, cnp_stencil_load_int_reg1_code, stencil_size);
  return stencil_start + stencil_size;
}

// Writes `value` over the placeholder immediate of a load_int_reg1 stencil
// previously copied to stencil_start.
void cnp_patch_load_int_reg1(uint8_t* stencil_start, int value) {
  // 2: R_X86_64_32 cnp_value_hole  ->  0x02 offset
  memcpy(stencil_start + 0x2, &value, sizeof(value));
}

// Bytes of the load_int_reg2 stencil without its trailing jmp; the zeroed
// immediate is filled in by cnp_patch_load_int_reg2().
uint8_t cnp_stencil_load_int_reg2_code[] = {
  0x41, 0xbd,             // mov r13d, imm32
  0x00, 0x00, 0x00, 0x00, //   imm32 placeholder
};

// Emits the load_int_reg2 stencil at stencil_start and returns the first
// byte after it.
uint8_t* cnp_copy_load_int_reg2(uint8_t* stencil_start) {
  uint8_t* const next = stencil_start + sizeof cnp_stencil_load_int_reg2_code;
  memcpy(stencil_start, cnp_stencil_load_int_reg2_code,
         sizeof cnp_stencil_load_int_reg2_code);
  return next;
}

// Stores `value` into the immediate of an already-emitted load_int_reg2
// stencil.
void cnp_patch_load_int_reg2(uint8_t* stencil_start, int value) {
  // Relocation at 0x12 in the listing, function base 0x10 -> offset 2.
  uint8_t* const hole = &stencil_start[2];
  memcpy(hole, &value, sizeof value);
}

// Bytes of the add_int1_int2 stencil without its trailing jmp.
uint8_t cnp_stencil_add_int1_int2_code[] = {
  0x45, 0x01, 0xec, // add r12d,r13d
};

// Emits the add_int1_int2 stencil at stencil_start and returns the first
// byte after it.
uint8_t* cnp_copy_add_int1_int2(uint8_t* stencil_start) {
  uint8_t* const next = stencil_start + sizeof cnp_stencil_add_int1_int2_code;
  memcpy(stencil_start, cnp_stencil_add_int1_int2_code,
         sizeof cnp_stencil_add_int1_int2_code);
  return next;
}
// This stencil contains no value hole, so there is no patch function.

// Bytes of the return_int1 stencil; it ends in ret, so nothing is cut off.
uint8_t cnp_stencil_return_int1_code[] = {
  0x44, 0x89, 0xe0, // mov eax,r12d
  0xc3,             // ret
};

// Emits the return_int1 stencil at stencil_start and returns the first byte
// after it.
uint8_t* cnp_copy_return_int1(uint8_t* stencil_start) {
  uint8_t* const next = stencil_start + sizeof cnp_stencil_return_int1_code;
  memcpy(stencil_start, cnp_stencil_return_int1_code,
         sizeof cnp_stencil_return_int1_code);
  return next;
}
// This stencil contains no value hole, so there is no patch function.

첫 JIT 만들기

이제 스텐실 라이브러리가 준비됐으니, 코드 생성 함수들을 사용해 런타임에 특화된 덧셈기를 만들 수 있다:

cnp_jit.c

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

//#include "cnp_stencils.h"
uint8_t* cnp_copy_load_int_reg1(uint8_t* stencil_start);
void cnp_patch_load_int_reg1(uint8_t* stencil_start, int value);
uint8_t* cnp_copy_load_int_reg2(uint8_t* stencil_start);
void cnp_patch_load_int_reg2(uint8_t* stencil_start, int value);
uint8_t* cnp_copy_add_int1_int2(uint8_t* stencil_start);
uint8_t* cnp_copy_return_int1(uint8_t* stencil_start);

typedef int(*jit_func)() __attribute__((preserve_none));

jit_func create_add_1_2() {
  // Most systems mark memory as non-executable by default
  // and mprotect() to set memory as executable needs
  // to be run against mmap-allocated memory.  We start
  // by allocating it as read/write, and then switch it
  // to write/execute once we're done writing the code.
  uint8_t* codedata = mmap(NULL, 256, PROT_READ | PROT_WRITE,
      MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
  assert (codedata != MAP_FAILED);
  jit_func ret = (jit_func)codedata;

  // Concatenate our program together, while saving the
  // locations that need to be patched.
  uint8_t* load_int_reg1_location = codedata;
  codedata = cnp_copy_load_int_reg1(codedata);
  uint8_t* load_int_reg2_location = codedata;
  codedata = cnp_copy_load_int_reg2(codedata);
  codedata = cnp_copy_add_int1_int2(codedata);
  codedata = cnp_copy_return_int1(codedata);

  // Overwrite the zero value placeholders with our intended
  // specialized values: 1 and 2.
  cnp_patch_load_int_reg1(load_int_reg1_location, 1);
  cnp_patch_load_int_reg2(load_int_reg2_location, 2);

  // Now that we're done writing, remove write access and
  // allow execution from this page instead.
  int rc = mprotect(ret, 256, PROT_READ | PROT_EXEC);
  if (rc) {
    perror("mprotect");
  }
  return ret;
}

int main() {
  jit_func add_1_2 = create_add_1_2();
  int result = add_1_2();
  printf("JIT'd 1 + 2 = %d\n", result);
  return 0;
}

이제 이를 컴파일하고 실행할 수 있다!

$ clang cnp_jit.c cnp_stencils.c -o cnp_jit
$ ./cnp_jit
JIT'd 1 + 2 = 3

이 모든 마법이 어떻게/왜 맞물려 동작하는지에 대한 더 깊은 설명은 How It Works에서 계속된다.

함께 보기

A Worked Example of Copy-and-Patch Compilation

직접 해보기

재배치 구멍을 선언하기 쉽게 해주는 몇 가지 매크로를 제공하는 헤더는 다음과 같다:

cnp_stencils.h

#include <stdint.h>

// Applied to every stencil and to the function-pointer types used to leave
// one: selects the preserve_none calling convention.
#define STENCIL_FUNCTION __attribute__((preserve_none))

// Declared only (no definition in this header); DECLARE_STENCIL_OUTPUT points
// its function pointer at this symbol.
extern void cnp_stencil_output(void) STENCIL_FUNCTION;

// Each macro below expands to the address of a declared-only symbol whose
// name ends in `ordinal`, converted to `type`. The symbols themselves are
// declared by DECLARE_EXTERN_HOLES(ordinal).

// Value hole backed by cnp_small_value_hole_<ordinal>; appears as an
// R_X86_64_32 relocation in the example listing.
#define STENCIL_HOLE32(ordinal, type) \
  (type)((uintptr_t)&cnp_small_value_hole_##ordinal)
// Value hole backed by cnp_large_value_hole_<ordinal>; appears as an
// R_X86_64_64 relocation in the example listing.
#define STENCIL_HOLE64(ordinal, type) \
  (type)((uintptr_t)&cnp_large_value_hole_##ordinal)
// Function hole backed by cnp_near_func_hole_<ordinal>; the example listing
// calls it directly through an R_X86_64_PLT32 relocation.
#define STENCIL_FN_NEAR(ordinal, type) \
  (type)&cnp_near_func_hole_##ordinal
// Function hole backed by cnp_far_func_hole_<ordinal>. The address is passed
// through an empty asm statement with a "+r" constraint before being cast;
// in the example listing the result is a movabs (R_X86_64_64) followed by an
// indirect `call rax`.
#define STENCIL_FN_FAR(ordinal, type) \
  ({ uint64_t _cnp_addr_as_int = (uint64_t)((uintptr_t)&cnp_far_func_hole_##ordinal); \
  asm volatile("" : "+r" (_cnp_addr_as_int) : : "memory"); \
  (type)_cnp_addr_as_int; })
// Declares a local function pointer `stencil_output`, taking the given
// parameter types, that points at cnp_stencil_output.
#define DECLARE_STENCIL_OUTPUT(...) \
  typedef void(*stencil_output_fn)(__VA_ARGS__) STENCIL_FUNCTION; \
  stencil_output_fn stencil_output = (stencil_output_fn)&cnp_stencil_output;

// Declares the four hole symbols for one ordinal.
// NOTE(review): the reason for the 100000 / 8 array sizes is not stated in
// this header.
#define DECLARE_EXTERN_HOLES(ordinal) \
extern char cnp_large_value_hole_##ordinal[100000]; \
extern char cnp_small_value_hole_##ordinal[8]; \
extern void cnp_near_func_hole_##ordinal(void) STENCIL_FUNCTION; \
extern char cnp_far_func_hole_##ordinal[100000];

(이 매크로들이 왜 이런 형태인지에 대한 자세한 내용이 궁금하다면, 시리즈의 다음 글을 참고하라!)

그다음 필요한 만큼 복잡한 스텐실을 선언할 수 있다:

complex_stencil.h

#include "cnp_stencils.h"

// Declare up to the maximum number of holes you need of one type
// in a function:
DECLARE_EXTERN_HOLES(1);
DECLARE_EXTERN_HOLES(2);

// Example stencil that uses every hole kind: two 32-bit value holes, one
// 64-bit value hole, a near function hole and a far function hole.
STENCIL_FUNCTION
void fused_multiply_add_sqrt_ifnotzero() {
  uint32_t a = STENCIL_HOLE32(1, uint32_t);
  // NOTE(review): the hole is cast to int32_t but stored in a uint32_t;
  // confirm which type is intended.
  uint32_t b = STENCIL_HOLE32(2, int32_t);
  uint64_t c = STENCIL_HOLE64(1, uint64_t);

  // a * b is evaluated in 32 bits (both operands are uint32_t) and only then
  // widened and added to c.
  uint64_t fma = a * b + c;

  if (fma == 0) {
    void (*div_trap)(void) = STENCIL_FN_NEAR(1, void(*)(void));
    div_trap();
  }

  uint64_t (*sqrt)(uint64_t) = STENCIL_FN_FAR(1, uint64_t(*)(uint64_t));
  // NOTE(review): sqrt receives c, not fma, so fma only feeds the zero check
  // above; confirm this is intended.
  uint64_t result = sqrt(c);

  DECLARE_STENCIL_OUTPUT(uint64_t);
  stencil_output(result);
}

그리고 완전성을 위해 덧붙이면, 이는 다음과 같이 컴파일된다:

스텐실

첫 JIT 만들기

함께 보기

직접 해보기

관련 추천 글

Copy-and-Patch: 작동 방식

엄마, 나 PostgreSQL용 새 JIT 컴파일러 만들었어 – Pinaraf의 웹사이트

기본적인 Just-In-Time 컴파일러

템플릿 인터프리터

스텐실

첫 JIT 만들기

함께 보기

직접 해보기

관련 추천 글

Copy-and-Patch: 작동 방식

엄마, 나 PostgreSQL용 새 JIT 컴파일러 만들었어 – Pinaraf의 웹사이트

기본적인 Just-In-Time 컴파일러

템플릿 인터프리터