/*
 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * Copyright (c) 2020, 2023, Huawei Technologies Co., Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_RISCV_ASSEMBLER_RISCV_HPP
#define CPU_RISCV_ASSEMBLER_RISCV_HPP

#include "asm/assembler.hpp"
#include "asm/register.hpp"
#include "code/codeCache.hpp"
#include "metaprogramming/enableIf.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/macros.hpp"
#include <type_traits>

// XLEN is the RISC-V ISA's name for the width of an integer register, in bits.
#define XLEN 64

// definitions of various symbolic names for machine registers

// Calls between C and Java code pass arguments in 8 general-purpose registers
// and 8 floating-point registers

// Number of arguments passed in registers, per register class, for the
// C ("_c") and Java ("_j") calling conventions.
class Argument {
 public:
  enum {
    // check more info at https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc
    n_int_register_parameters_c   = 8,   // x10, x11, ... x17 (c_rarg0, c_rarg1, ...)
    n_float_register_parameters_c = 8,   // f10, f11, ... f17 (c_farg0, c_farg1, ... )
    n_vector_register_parameters_c = 16,  // v8, v9, ... v23

    n_int_register_parameters_j   = 8, // x11, ... x17, x10 (j_rarg0, j_rarg1, ...)
    n_float_register_parameters_j = 8  // f10, f11, ... f17 (j_farg0, j_farg1, ...)
  };
};

// Integer argument registers of the C calling convention
// (caller-save registers): c_rargN is x(10 + N).
constexpr Register c_rarg0 = x10;
constexpr Register c_rarg1 = x11;
constexpr Register c_rarg2 = x12;
constexpr Register c_rarg3 = x13;
constexpr Register c_rarg4 = x14;
constexpr Register c_rarg5 = x15;
constexpr Register c_rarg6 = x16;
constexpr Register c_rarg7 = x17;

// Floating-point argument registers of the C calling convention:
// c_fargN is f(10 + N).
constexpr FloatRegister c_farg0 = f10;
constexpr FloatRegister c_farg1 = f11;
constexpr FloatRegister c_farg2 = f12;
constexpr FloatRegister c_farg3 = f13;
constexpr FloatRegister c_farg4 = f14;
constexpr FloatRegister c_farg5 = f15;
constexpr FloatRegister c_farg6 = f16;
constexpr FloatRegister c_farg7 = f17;

// Symbolically name the register arguments used by the Java calling convention.
// We have control over the convention for java so we can do what we please.
// What pleases us is to offset the java calling convention so that when
// we call a suitable jni method the arguments are lined up and we don't
// have to do much shuffling. A suitable jni method is non-static and has a
// small number of arguments.
//
// |------------------------------------------------------------------------|
// | c_rarg0  c_rarg1  c_rarg2  c_rarg3  c_rarg4  c_rarg5  c_rarg6  c_rarg7 |
// |------------------------------------------------------------------------|
// | x10      x11      x12      x13      x14      x15      x16      x17     |
// |------------------------------------------------------------------------|
// | j_rarg7  j_rarg0  j_rarg1  j_rarg2  j_rarg3  j_rarg4  j_rarg5  j_rarg6 |
// |------------------------------------------------------------------------|

// j_rargN is c_rarg(N + 1) for N = 0..6; j_rarg7 wraps around to c_rarg0.
constexpr Register j_rarg0 = c_rarg1;
constexpr Register j_rarg1 = c_rarg2;
constexpr Register j_rarg2 = c_rarg3;
constexpr Register j_rarg3 = c_rarg4;
constexpr Register j_rarg4 = c_rarg5;
constexpr Register j_rarg5 = c_rarg6;
constexpr Register j_rarg6 = c_rarg7;
constexpr Register j_rarg7 = c_rarg0;

// Java floating args are passed as per C (no offset: j_fargN is f(10 + N))

constexpr FloatRegister j_farg0 = f10;
constexpr FloatRegister j_farg1 = f11;
constexpr FloatRegister j_farg2 = f12;
constexpr FloatRegister j_farg3 = f13;
constexpr FloatRegister j_farg4 = f14;
constexpr FloatRegister j_farg5 = f15;
constexpr FloatRegister j_farg6 = f16;
constexpr FloatRegister j_farg7 = f17;

// zero register
constexpr Register zr = x0;
// global pointer
constexpr Register gp = x3;
// thread pointer
constexpr Register tp = x4;

// registers used to hold VM data either temporarily within a method
// or across method calls

// volatile (caller-save) registers

// current method -- must be in a call-clobbered register
// (x31 is also named t6 below)
constexpr Register xmethod =  x31;
// return address
constexpr Register ra      =  x1;

// non-volatile (callee-save) registers

constexpr Register sp            = x2; // stack pointer
constexpr Register fp            = x8; // frame pointer
constexpr Register xheapbase     = x27; // base of heap
constexpr Register xcpool        = x26; // constant pool cache
constexpr Register xmonitors     = x25; // monitors allocated on stack
constexpr Register xlocals       = x24; // locals on stack
constexpr Register xthread       = x23; // java thread pointer
constexpr Register xbcp          = x22; // bytecode pointer
constexpr Register xdispatch     = x21; // Dispatch table base
constexpr Register esp           = x20; // Java expression stack pointer
constexpr Register x19_sender_sp = x19; // Sender's SP while in interpreter

// temporary registers (caller-save registers)
constexpr Register t0 = x5;
constexpr Register t1 = x6;
constexpr Register t2 = x7;
constexpr Register t3 = x28;
constexpr Register t4 = x29;
constexpr Register t5 = x30;
constexpr Register t6 = x31; // same machine register as xmethod

// C integer argument registers in argument order, indexable by argument number.
const Register g_INTArgReg[Argument::n_int_register_parameters_c] = {
  c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5, c_rarg6, c_rarg7
};

// C floating-point argument registers in argument order, indexable by argument number.
const FloatRegister g_FPArgReg[Argument::n_float_register_parameters_c] = {
  c_farg0, c_farg1, c_farg2, c_farg3, c_farg4, c_farg5, c_farg6, c_farg7
};

// Assert a condition, using the stringified condition text as the message.
#define assert_cond(ARG1) assert(ARG1, #ARG1)

// Addressing modes
// An Address describes an instruction operand in one of two forms:
//  - base_plus_offset: a base register plus a signed 64-bit byte offset;
//  - literal: an absolute target address together with its relocation spec.
// A default-constructed Address has mode no_mode.
class Address {
 public:

  enum mode { no_mode, base_plus_offset, literal };

 private:
  // Payload used by no_mode and base_plus_offset.
  struct Nonliteral {
    Nonliteral(Register base, Register index, int64_t offset)
      : _base(base), _index(index), _offset(offset) {}
    Register _base;
    Register _index;
    int64_t _offset;
  };

  // Payload used by literal mode.
  struct Literal {
    Literal(address target, const RelocationHolder& rspec)
      : _target(target), _rspec(rspec) {}
    // If the target is far we'll need to load the ea of this to a
    // register to reach it. Otherwise if near we can do PC-relative
    // addressing.
    address _target;

    RelocationHolder _rspec;
  };

  // Debug-only checks that the active union member matches the accessor used.
  void assert_is_nonliteral() const NOT_DEBUG_RETURN;
  void assert_is_literal() const NOT_DEBUG_RETURN;

  // Discriminated union, based on _mode.
  // - no_mode: uses dummy _nonliteral, for ease of copying.
  // - literal: only _literal is used.
  // - others: only _nonliteral is used.
  enum mode _mode;
  union {
    Nonliteral _nonliteral;
    Literal _literal;
  };

  // Helper for copy constructor and assignment operator.
  // Copy mode-relevant part of a into this.
  // Precondition: _mode has already been set to a._mode, and &a != this.
  void copy_data(const Address& a) {
    assert(_mode == a._mode, "precondition");
    if (_mode == literal) {
      new (&_literal) Literal(a._literal);
    } else {
      // non-literal mode or no_mode.
      new (&_nonliteral) Nonliteral(a._nonliteral);
    }
  }

 public:
  // no_mode initializes _nonliteral for ease of copying.
  Address() :
    _mode(no_mode),
    _nonliteral(noreg, noreg, 0)
  {}

  // [r + 0]
  Address(Register r) :
    _mode(base_plus_offset),
    _nonliteral(r, noreg, 0)
  {}

  // [r + o], for any integral offset type.
  template<typename T, ENABLE_IF(std::is_integral<T>::value)>
  Address(Register r, T o) :
    _mode(base_plus_offset),
    _nonliteral(r, noreg, o)
  {}

  // [r + disp], with the displacement given as a ByteSize.
  Address(Register r, ByteSize disp) : Address(r, in_bytes(disp)) {}

  // Literal address with an explicit relocation spec.
  Address(address target, const RelocationHolder& rspec) :
    _mode(literal),
    _literal(target, rspec)
  {}

  // Literal address; the relocation spec is derived from rtype
  // (defined out of line).
  Address(address target, relocInfo::relocType rtype = relocInfo::external_word_type);

  Address(const Address& a) : _mode(a._mode) { copy_data(a); }

  // Verify the value is trivially destructible regardless of mode, so our
  // destructor can also be trivial, and so our assignment operator doesn't
  // need to destruct the old value before copying over it.
  static_assert(std::is_trivially_destructible<Literal>::value, "must be");
  static_assert(std::is_trivially_destructible<Nonliteral>::value, "must be");

  Address& operator=(const Address& a) {
    // copy_data() constructs a new payload over our own storage, so on
    // self-assignment it would copy from an object whose lifetime has just
    // been ended. Nothing needs to change in that case.
    if (this != &a) {
      _mode = a._mode;
      copy_data(a);
    }
    return *this;
  }

  ~Address() = default;

  // Base register; only valid for non-literal addresses.
  const Register base() const {
    assert_is_nonliteral();
    return _nonliteral._base;
  }

  // Byte offset; only valid for non-literal addresses.
  long offset() const {
    assert_is_nonliteral();
    return _nonliteral._offset;
  }

  // Index register; only valid for non-literal addresses.
  Register index() const {
    assert_is_nonliteral();
    return _nonliteral._index;
  }

  mode getMode() const {
    return _mode;
  }

  // True if this is a non-literal address whose base register is reg.
  bool uses(Register reg) const {
    return _mode != literal && base() == reg;
  }

  // Target address; only valid for literal addresses.
  address target() const {
    assert_is_literal();
    return _literal._target;
  }

  // Relocation spec; only valid for literal addresses.
  const RelocationHolder& rspec() const {
    assert_is_literal();
    return _literal._rspec;
  }
};

// Convenience classes
// Literal Address tagged with relocInfo::runtime_call_type.
class RuntimeAddress: public Address {

  public:

  RuntimeAddress(address target) : Address(target, relocInfo::runtime_call_type) {}
  // Defaulted so the subclass stays trivially destructible, like Address.
  ~RuntimeAddress() = default;
};

// Literal Address tagged with relocInfo::oop_type.
class OopAddress: public Address {

  public:

  OopAddress(address target) : Address(target, relocInfo::oop_type) {}
  // Defaulted so the subclass stays trivially destructible, like Address.
  ~OopAddress() = default;
};

// Literal Address tagged with relocInfo::external_word_type when the target
// can be relocated that way, and with relocInfo::none otherwise.
class ExternalAddress: public Address {
 private:
  static relocInfo::relocType reloc_for_target(address target) {
    // Sometimes ExternalAddress is used for values which aren't
    // exactly addresses, like the card table base.
    // external_word_type can't be used for values in the first page
    // so just skip the reloc in that case.
    return external_word_Relocation::can_be_relocated(target) ? relocInfo::external_word_type : relocInfo::none;
  }

 public:

  ExternalAddress(address target) : Address(target, reloc_for_target(target)) {}
  // Defaulted so the subclass stays trivially destructible, like Address.
  ~ExternalAddress() = default;
};

// Literal Address tagged with relocInfo::internal_word_type.
class InternalAddress: public Address {

  public:

  InternalAddress(address target) : Address(target, relocInfo::internal_word_type) {}
  // Defaulted so the subclass stays trivially destructible, like Address.
  ~InternalAddress() = default;
};

class Assembler : public AbstractAssembler {
protected:

  // Look up a raw 64-bit pattern in the table of double constants below.
  // Returns the table position (0..31) on an exact match, -1 otherwise.
  // NOTE(review): the position is presumably the immediate index used by the
  // Zfa fli.d instruction -- confirm against the callers.
  static int zfa_zli_lookup_double(uint64_t value) {
    // Position in this table is the value returned for the pattern.
    static constexpr uint64_t encodings[32] = {
      0xbff0000000000000, 0x0010000000000000, 0x3ef0000000000000, 0x3f00000000000000,
      0x3f70000000000000, 0x3f80000000000000, 0x3fb0000000000000, 0x3fc0000000000000,
      0x3fd0000000000000, 0x3fd4000000000000, 0x3fd8000000000000, 0x3fdc000000000000,
      0x3fe0000000000000, 0x3fe4000000000000, 0x3fe8000000000000, 0x3fec000000000000,
      0x3ff0000000000000, 0x3ff4000000000000, 0x3ff8000000000000, 0x3ffc000000000000,
      0x4000000000000000, 0x4004000000000000, 0x4008000000000000, 0x4010000000000000,
      0x4020000000000000, 0x4030000000000000, 0x4060000000000000, 0x4070000000000000,
      0x40e0000000000000, 0x40f0000000000000, 0x7ff0000000000000, 0x7ff8000000000000
    };
    for (int idx = 0; idx < 32; idx++) {
      if (encodings[idx] == value) {
        return idx;
      }
    }
    return -1;
  }

  // Look up a raw 32-bit pattern in the table of float constants below.
  // Returns the table position (0..31) on an exact match, -1 otherwise.
  // NOTE(review): the position is presumably the immediate index used by the
  // Zfa fli.s instruction -- confirm against the callers.
  static int zfa_zli_lookup_float(uint32_t value) {
    // Position in this table is the value returned for the pattern.
    static constexpr uint32_t encodings[32] = {
      0xbf800000, 0x00800000, 0x37800000, 0x38000000,
      0x3b800000, 0x3c000000, 0x3d800000, 0x3e000000,
      0x3e800000, 0x3ea00000, 0x3ec00000, 0x3ee00000,
      0x3f000000, 0x3f200000, 0x3f400000, 0x3f600000,
      0x3f800000, 0x3fa00000, 0x3fc00000, 0x3fe00000,
      0x40000000, 0x40200000, 0x40400000, 0x40800000,
      0x41000000, 0x41800000, 0x43000000, 0x43800000,
      0x47000000, 0x47800000, 0x7f800000, 0x7fc00000
    };
    for (int idx = 0; idx < 32; idx++) {
      if (encodings[idx] == value) {
        return idx;
      }
    }
    return -1;
  }

  // Look up a raw 16-bit pattern in the table of half-float constants below.
  // Returns the table position on an exact match, -1 otherwise. Position 30
  // is never returned: it holds the same pattern as position 29.
  // NOTE(review): the position is presumably the immediate index used by the
  // Zfa fli.h instruction -- confirm against the callers.
  static int zfa_zli_lookup_half_float(uint16_t value) {
    // Position in this table is the value returned for the pattern. The scan
    // goes from low to high, so for 0x7c00 (positions 29 and 30) the first
    // hit, 29, wins.
    static constexpr uint16_t encodings[32] = {
      0xbc00, 0x0400, 0x0100, 0x0200,
      0x1c00, 0x2000, 0x2c00, 0x3000,
      0x3400, 0x3500, 0x3600, 0x3700,
      0x3800, 0x3900, 0x3a00, 0x3b00,
      0x3c00, 0x3d00, 0x3e00, 0x3f00,
      0x4000, 0x4100, 0x4200, 0x4400,
      0x4800, 0x4c00, 0x5800, 0x5c00,
      0x7800, 0x7c00, 0x7c00 /* redundant with 29 */, 0x7e00
    };
    for (int idx = 0; idx < 32; idx++) {
      if (encodings[idx] == value) {
        return idx;
      }
    }
    return -1;
  }

 public:

  // True when both UseZfa and UseZfh are set and the bit pattern of hf is
  // found by zfa_zli_lookup_half_float().
  static bool can_zfa_zli_half_float(jshort hf) {
    const bool extensions_enabled = UseZfa && UseZfh;
    return extensions_enabled && zfa_zli_lookup_half_float((uint16_t)hf) != -1;
  }

  // True when UseZfa is set and the bit pattern of f is found by
  // zfa_zli_lookup_float().
  static bool can_zfa_zli_float(jfloat f) {
    return UseZfa && zfa_zli_lookup_float((uint32_t)jint_cast(f)) != -1;
  }

  // True when UseZfa is set and the bit pattern of d is found by
  // zfa_zli_lookup_double().
  static bool can_zfa_zli_double(jdouble d) {
    return UseZfa && zfa_zli_lookup_double((uint64_t)julong_cast(d)) != -1;
  }

  // Instruction lengths in bytes.
  enum {
    instruction_size = 4,            // regular 32-bit instruction
    compressed_instruction_size = 2, // 16-bit compressed instruction
  };

  // Tells whether the instruction at 'instr' is a 16-bit compressed one.
  // Always false when UseRVC is off; memory is not read in that case.
  // instruction must start at passed address
  static bool is_compressed_instr(address instr) {
    if (!UseRVC) {
      return false;
    }
    // The RISC-V ISA Manual, Section 'Base Instruction-Length Encoding':
    // Instructions are stored in memory as a sequence of 16-bit little-endian parcels, regardless of
    // memory system endianness. Parcels forming one instruction are stored at increasing halfword
    // addresses, with the lowest-addressed parcel holding the lowest-numbered bits in the instruction
    // specification.
    const uint16_t first_parcel = ((uint16_t *)instr)[0];
    // 16-bit instructions have their lowest two bits equal to 0b00, 0b01, or 0b10;
    // 32-bit instructions have their lowest two bits set to 0b11
    return (first_parcel & 0b11) != 0b11;
  }

  //---<  calculate length of instruction  >---
  // Yields compressed_instruction_size for a compressed instruction and
  // instruction_size for any other.
  // instruction must start at passed address
  static unsigned int instr_len(address instr) {
    if (is_compressed_instr(instr)) {
      return compressed_instruction_size;
    }
    return instruction_size;
  }

  //---<  longest instructions  >---
  // Upper bound on instr_len(): the size of an uncompressed instruction.
  static unsigned int instr_maxlen() { return instruction_size; }

  // Floating-point rounding mode encodings (3 bits).
  enum RoundingMode {
    rne = 0b000,     // round to Nearest, ties to Even
    rtz = 0b001,     // round towards Zero
    rdn = 0b010,     // round Down (towards negative infinity)
    rup = 0b011,     // round Up (towards infinity)
    rmm = 0b100,     // round to Nearest, ties to Max Magnitude
    rdy = 0b111,     // in an instruction's rm field, selects the dynamic rounding mode; invalid in the Rounding Mode register.
  };

  // handle unaligned access
  // Read / write one 16-bit instruction parcel at addr through the Bytes
  // native-order helpers.
  static inline uint16_t ld_c_instr(address addr) {
    return Bytes::get_native_u2(addr);
  }
  static inline void sd_c_instr(address addr, uint16_t c_instr) {
    Bytes::put_native_u2(addr, c_instr);
  }

  // handle unaligned access
  // Read / write one 32-bit instruction word at addr through the Bytes
  // native-order helpers.
  static inline uint32_t ld_instr(address addr) {
    return Bytes::get_native_u4(addr);
  }
  static inline void sd_instr(address addr, uint32_t instr) {
    Bytes::put_native_u4(addr, instr);
  }

  // Return the unsigned bit field val[msb:lsb] (both bounds inclusive, bit 0
  // is the least significant bit), right-aligned in the result.
  // Requires lsb <= msb <= 31.
  static inline uint32_t extract(uint32_t val, unsigned msb, unsigned lsb) {
    assert_cond(msb >= lsb && msb <= 31);
    unsigned nbits = msb - lsb + 1;
    // The full-width field (msb == 31, lsb == 0) needs an all-ones mask.
    // Computing it as (1U << 32) - 1 would shift by the operand width, which
    // is undefined behavior.
    uint32_t mask = (nbits == 32) ? ~0U : (1U << nbits) - 1;
    uint32_t result = val >> lsb;
    result &= mask;
    return result;
  }

  // Return the bit field val[msb:lsb] (both bounds inclusive) sign-extended
  // from bit msb. Requires lsb <= msb <= 31.
  static inline int32_t sextract(uint32_t val, unsigned msb, unsigned lsb) {
    assert_cond(msb >= lsb && msb <= 31);
    // Move bit msb up to bit 31, then shift the signed value back down so the
    // field's top bit is replicated into the upper bits.
    const unsigned drop_high = 31 - msb;
    int32_t field = val << drop_high;
    field >>= (drop_high + lsb);
    return field;
  }

  // Overwrite bits [msb:lsb] (both bounds inclusive) of the 32-bit
  // instruction word at 'a' with 'val'; all other bits are preserved.
  // Requires lsb <= msb <= 31. Fails the guarantee if 'val' does not fit in
  // the field.
  static void patch(address a, unsigned msb, unsigned lsb, unsigned val) {
    assert_cond(a != nullptr);
    assert_cond(msb >= lsb && msb <= 31);
    unsigned nbits = msb - lsb + 1;
    // For a full-width field, (1U << 32) would shift by the operand width,
    // which is undefined behavior; use an all-ones mask directly instead.
    unsigned mask = (nbits == 32) ? ~0U : (1U << nbits) - 1;
    guarantee(val <= mask, "Field too big for insn");
    val <<= lsb;
    mask <<= lsb;
    unsigned target = ld_instr(a);
    target &= ~mask;
    target |= val;
    sd_instr(a, target);
  }

  // Single-bit form: overwrite bit 'bit' of the instruction word at 'a' with
  // 'val' (which must be 0 or 1 to fit the one-bit field).
  static void patch(address a, unsigned bit, unsigned val) {
    patch(a, bit, bit, val);
  }

  // Write the raw encoding of a register into the 5-bit field
  // [lsb + 4 : lsb] of the instruction word at 'a'. One overload per
  // register class; all three patch the field the same way.
  static void patch_reg(address a, unsigned lsb, Register reg) {
    patch(a, lsb + 4, lsb, reg->raw_encoding());
  }

  static void patch_reg(address a, unsigned lsb, FloatRegister reg) {
    patch(a, lsb + 4, lsb, reg->raw_encoding());
  }

  static void patch_reg(address a, unsigned lsb, VectorRegister reg) {
    patch(a, lsb + 4, lsb, reg->raw_encoding());
  }

  // Emit one 32-bit instruction word via emit_int32().
  void emit(unsigned insn) {
    emit_int32((jint)insn);
  }

  // CSR numbers 0xc00..0xc1f. Enumerators without an explicit value continue
  // the sequence, so time is 0xc01, instret is 0xc02 and hpmcounterN is
  // 0xc00 + N.
  enum csr {
    cycle = 0xc00,
    time,
    instret,
    hpmcounter3,
    hpmcounter4,
    hpmcounter5,
    hpmcounter6,
    hpmcounter7,
    hpmcounter8,
    hpmcounter9,
    hpmcounter10,
    hpmcounter11,
    hpmcounter12,
    hpmcounter13,
    hpmcounter14,
    hpmcounter15,
    hpmcounter16,
    hpmcounter17,
    hpmcounter18,
    hpmcounter19,
    hpmcounter20,
    hpmcounter21,
    hpmcounter22,
    hpmcounter23,
    hpmcounter24,
    hpmcounter25,
    hpmcounter26,
    hpmcounter27,
    hpmcounter28,
    hpmcounter29,
    hpmcounter30,
    hpmcounter31 = 0xc1f
  };

  // Emit an illegal instruction that's known to trap, with 32 read-only CSRs
  // to choose from as the input operand.
  // According to the RISC-V Assembly Programmer's Manual, a de facto implementation
  // of this instruction is the UNIMP pseudo-instruction, 'CSRRW x0, cycle, x0',
  // attempting to write zero to a read-only CSR 'cycle' (0xC00).
  // RISC-V ISAs provide a set of up to 32 read-only CSR registers 0xC00-0xC1F,
  // and an attempt to write into any read-only CSR (whether it exists or not)
  // will generate an illegal instruction exception.
  void illegal_instruction(csr csr_reg) {
    csrrw(x0, (unsigned)csr_reg, x0);
  }

// Register Instruction
// Each INSN(NAME, op, funct3, funct7) defines NAME(Rd, Rs1, Rs2), which emits
// one 32-bit word laid out as:
//   bits  6:0  op       bits 11:7  Rd       bits 14:12 funct3
//   bits 19:15 Rs1      bits 24:20 Rs2      bits 31:25 funct7
#define INSN(NAME, op, funct3, funct7)                          \
  void NAME(Register Rd, Register Rs1, Register Rs2) {          \
    unsigned insn = 0;                                          \
    patch((address)&insn, 6,  0, op);                           \
    patch((address)&insn, 14, 12, funct3);                      \
    patch((address)&insn, 31, 25, funct7);                      \
    patch_reg((address)&insn, 7, Rd);                           \
    patch_reg((address)&insn, 15, Rs1);                         \
    patch_reg((address)&insn, 20, Rs2);                         \
    emit(insn);                                                 \
  }

  INSN(_add,  0b0110011, 0b000, 0b0000000);
  INSN(_sub,  0b0110011, 0b000, 0b0100000);
  INSN(_andr, 0b0110011, 0b111, 0b0000000);
  INSN(_orr,  0b0110011, 0b110, 0b0000000);
  INSN(_xorr, 0b0110011, 0b100, 0b0000000);
  INSN(sll,   0b0110011, 0b001, 0b0000000);
  INSN(sra,   0b0110011, 0b101, 0b0100000);
  INSN(srl,   0b0110011, 0b101, 0b0000000);
  INSN(slt,   0b0110011, 0b010, 0b0000000);
  INSN(sltu,  0b0110011, 0b011, 0b0000000);
  INSN(_addw, 0b0111011, 0b000, 0b0000000);
  INSN(_subw, 0b0111011, 0b000, 0b0100000);
  INSN(sllw,  0b0111011, 0b001, 0b0000000);
  INSN(sraw,  0b0111011, 0b101, 0b0100000);
  INSN(srlw,  0b0111011, 0b101, 0b0000000);
  INSN(_mul,  0b0110011, 0b000, 0b0000001);
  INSN(mulh,  0b0110011, 0b001, 0b0000001);
  INSN(mulhsu,0b0110011, 0b010, 0b0000001);
  INSN(mulhu, 0b0110011, 0b011, 0b0000001);
  INSN(mulw,  0b0111011, 0b000, 0b0000001);
  INSN(div,   0b0110011, 0b100, 0b0000001);
  INSN(divu,  0b0110011, 0b101, 0b0000001);
  INSN(divw,  0b0111011, 0b100, 0b0000001);
  INSN(divuw, 0b0111011, 0b101, 0b0000001);
  INSN(rem,   0b0110011, 0b110, 0b0000001);
  INSN(remu,  0b0110011, 0b111, 0b0000001);
  INSN(remw,  0b0111011, 0b110, 0b0000001);
  INSN(remuw, 0b0111011, 0b111, 0b0000001);

#undef INSN

 private:
  // Load
  // Values for the 3-bit width field (bits 14:12) of a load instruction,
  // used as the 'width' template argument of load_base.
  enum LoadWidthFunct3 : uint8_t {
    LOAD_WIDTH_BYTE              = 0b000,
    LOAD_WIDTH_HALFWORD          = 0b001,
    LOAD_WIDTH_WORD              = 0b010,
    LOAD_WIDTH_DOUBLEWORD        = 0b011,
    LOAD_WIDTH_BYTE_UNSIGNED     = 0b100,
    LOAD_WIDTH_HALFWORD_UNSIGNED = 0b101,
    LOAD_WIDTH_WORD_UNSIGNED     = 0b110,
    // 0b111 is reserved
  };

  // 7-bit major opcodes (bits 6:0) for integer and floating-point loads.
  static constexpr uint8_t OP_LOAD_MAJOR    = 0b0000011;
  static constexpr uint8_t OP_FP_LOAD_MAJOR = 0b0000111;

  template <uint8_t op_major, LoadWidthFunct3 width>
  void load_base(uint8_t Rd, Register Rs, const int32_t offset) {
    guarantee(is_simm12(offset), "offset is invalid.");
    unsigned insn = 0;
    int32_t val = offset & 0xfff;
    patch((address)&insn,  6,  0, op_major);
    patch((address)&insn, 11,  7, Rd);
    patch((address)&insn, 14, 12, width);
    patch_reg((address)&insn, 15, Rs);
    patch((address)&insn, 31, 20, val);
    emit(insn);
  }

  // Integer load: destination is a general register, opcode OP_LOAD_MAJOR.
  template <LoadWidthFunct3 width>
  void load_base(Register Rd, Register Rs, const int32_t offset) {
    load_base<OP_LOAD_MAJOR, width>(Rd->raw_encoding(), Rs, offset);
  }

  // Floating-point load: destination is a float register, opcode OP_FP_LOAD_MAJOR.
  template <LoadWidthFunct3 width>
  void load_base(FloatRegister Rd, Register Rs, const int32_t offset) {
    load_base<OP_FP_LOAD_MAJOR, width>(Rd->raw_encoding(), Rs, offset);
  }

 public:

  // Integer loads: Rd <- memory[Rs + offset]. Each emitter only selects the
  // width code; the offset must fit in a signed 12-bit immediate (checked in
  // load_base).

  // load byte
  void lb(Register Rd, Register Rs, const int32_t offset) {
    load_base<LOAD_WIDTH_BYTE>(Rd, Rs, offset);
  }

  // load byte, unsigned
  void _lbu(Register Rd, Register Rs, const int32_t offset) {
    load_base<LOAD_WIDTH_BYTE_UNSIGNED>(Rd, Rs, offset);
  }

  // load halfword
  void _lh(Register Rd, Register Rs, const int32_t offset) {
    load_base<LOAD_WIDTH_HALFWORD>(Rd, Rs, offset);
  }

  // load halfword, unsigned
  void _lhu(Register Rd, Register Rs, const int32_t offset) {
    load_base<LOAD_WIDTH_HALFWORD_UNSIGNED>(Rd, Rs, offset);
  }

  // load word
  void _lw(Register Rd, Register Rs, const int32_t offset) {
    load_base<LOAD_WIDTH_WORD>(Rd, Rs, offset);
  }

  // load word, unsigned
  void lwu(Register Rd, Register Rs, const int32_t offset) {
    load_base<LOAD_WIDTH_WORD_UNSIGNED>(Rd, Rs, offset);
  }

  // load doubleword
  void _ld(Register Rd, Register Rs, const int32_t offset) {
    load_base<LOAD_WIDTH_DOUBLEWORD>(Rd, Rs, offset);
  }

  // Floating-point loads: same encoding as the integer loads but with
  // OP_FP_LOAD_MAJOR (selected by the FloatRegister overload of load_base).

  // load half-precision (halfword width)
  void flh(FloatRegister Rd, Register Rs, const int32_t offset) {
    load_base<LOAD_WIDTH_HALFWORD>(Rd, Rs, offset);
  }

  // load single-precision (word width)
  void flw(FloatRegister Rd, Register Rs, const int32_t offset) {
    load_base<LOAD_WIDTH_WORD>(Rd, Rs, offset);
  }

  // load double-precision (doubleword width)
  void _fld(FloatRegister Rd, Register Rs, const int32_t offset) {
    load_base<LOAD_WIDTH_DOUBLEWORD>(Rd, Rs, offset);
  }

// Conditional branches.
// Each INSN(NAME, op, funct3) defines NAME(Rs1, Rs2, offset). The offset must
// be an even signed 13-bit value; bit 0 is not encoded. It is scattered over
// the instruction word as follows:
//   offset[11]   -> bit 7        offset[4:1] -> bits 11:8
//   offset[10:5] -> bits 30:25   offset[12]  -> bit 31
// Remaining fields: bits 6:0 op, 14:12 funct3, 19:15 Rs1, 24:20 Rs2.
#define INSN(NAME, op, funct3)                                                                           \
  void NAME(Register Rs1, Register Rs2, const int64_t offset) {                                          \
    guarantee(is_simm13(offset) && ((offset % 2) == 0), "offset is invalid.");                           \
    unsigned insn = 0;                                                                                   \
    uint32_t val  = offset & 0x1fff;                                                                     \
    uint32_t val11 = (val >> 11) & 0x1;                                                                  \
    uint32_t val12 = (val >> 12) & 0x1;                                                                  \
    uint32_t low  = (val >> 1) & 0xf;                                                                    \
    uint32_t high = (val >> 5) & 0x3f;                                                                   \
    patch((address)&insn, 6, 0, op);                                                                     \
    patch((address)&insn, 14, 12, funct3);                                                               \
    patch_reg((address)&insn, 15, Rs1);                                                                  \
    patch_reg((address)&insn, 20, Rs2);                                                                  \
    patch((address)&insn, 7, val11);                                                                     \
    patch((address)&insn, 11, 8, low);                                                                   \
    patch((address)&insn, 30, 25, high);                                                                 \
    patch((address)&insn, 31, val12);                                                                    \
    emit(insn);                                                                                          \
  }

  INSN(beq,  0b1100011, 0b000);
  INSN(bne,  0b1100011, 0b001);
  INSN(bge,  0b1100011, 0b101);
  INSN(bgeu, 0b1100011, 0b111);
  INSN(blt,  0b1100011, 0b100);
  INSN(bltu, 0b1100011, 0b110);

#undef INSN

 private:

  // Values for the 3-bit width field (bits 14:12) of a store instruction,
  // used as the 'width' template argument of store_base.
  enum StoreWidthFunct3 : uint8_t {
    STORE_WIDTH_BYTE        = 0b000,
    STORE_WIDTH_HALFWORD    = 0b001,
    STORE_WIDTH_WORD        = 0b010,
    STORE_WIDTH_DOUBLEWORD  = 0b011,
    // 0b100 to 0b111 are reserved for this opcode
  };

  // 7-bit major opcodes (bits 6:0) for integer and floating-point stores.
  static constexpr uint8_t OP_STORE_MAJOR    = 0b0100011;
  static constexpr uint8_t OP_FP_STORE_MAJOR = 0b0100111;

  template <uint8_t op_code, StoreWidthFunct3 width>
  void store_base(uint8_t Rs2, Register Rs1, const int32_t offset) {
    guarantee(is_simm12(offset), "offset is invalid.");
    unsigned insn = 0;
    uint32_t val  = offset & 0xfff;
    uint32_t low  = val & 0x1f;
    uint32_t high = (val >> 5) & 0x7f;
    patch((address)&insn,  6,  0, op_code);
    patch((address)&insn, 11,  7, low);
    patch((address)&insn, 14, 12, width);
    patch_reg((address)&insn, 15, Rs1);
    patch((address)&insn, 24, 20, Rs2);
    patch((address)&insn, 31, 25, high);
    emit(insn);
  }

  // Integer store: source is a general register, opcode OP_STORE_MAJOR.
  template <StoreWidthFunct3 width>
  void store_base(Register Rs2, Register Rs1, const int32_t offset) {
    store_base<OP_STORE_MAJOR, width>(Rs2->raw_encoding(), Rs1, offset);
  }

  // Floating-point store: source is a float register, opcode OP_FP_STORE_MAJOR.
  template <StoreWidthFunct3 width>
  void store_base(FloatRegister Rs2, Register Rs1, const int32_t offset) {
    store_base<OP_FP_STORE_MAJOR, width>(Rs2->raw_encoding(), Rs1, offset);
  }

 public:

  // Stores: memory[Rs1 + offset] <- Rs2. Each emitter only selects the width
  // code; the offset must fit in a signed 12-bit immediate (checked in
  // store_base).

  // store byte
  void _sb(Register Rs2, Register Rs1, const int32_t offset) {
    store_base<STORE_WIDTH_BYTE>(Rs2, Rs1, offset);
  }

  // store halfword
  void _sh(Register Rs2, Register Rs1, const int32_t offset) {
    store_base<STORE_WIDTH_HALFWORD>(Rs2, Rs1, offset);
  }

  // store word
  void _sw(Register Rs2, Register Rs1, const int32_t offset) {
    store_base<STORE_WIDTH_WORD>(Rs2, Rs1, offset);
  }

  // store doubleword
  void _sd(Register Rs2, Register Rs1, const int32_t offset) {
    store_base<STORE_WIDTH_DOUBLEWORD>(Rs2, Rs1, offset);
  }

  // store single-precision float (word width)
  void fsw(FloatRegister Rs2, Register Rs1, const int32_t offset) {
    store_base<STORE_WIDTH_WORD>(Rs2, Rs1, offset);
  }

  // store double-precision float (doubleword width)
  void _fsd(FloatRegister Rs2, Register Rs1, const int32_t offset) {
    store_base<STORE_WIDTH_DOUBLEWORD>(Rs2, Rs1, offset);
  }

// CSR access with a register source operand.
// Each INSN(NAME, op, funct3) defines NAME(Rd, csr, Rs1), which emits
//   bits 6:0 op | 11:7 Rd | 14:12 funct3 | 19:15 Rs1 | 31:20 csr
// The CSR number must fit in 12 unsigned bits, otherwise the guarantee fails.
#define INSN(NAME, op, funct3)                                                        \
  void NAME(Register Rd, const uint32_t csr, Register Rs1) {                          \
    guarantee(is_uimm12(csr), "csr is invalid");                                      \
    unsigned insn = 0;                                                                \
    patch((address)&insn, 6, 0, op);                                                  \
    patch((address)&insn, 14, 12, funct3);                                            \
    patch_reg((address)&insn, 7, Rd);                                                 \
    patch_reg((address)&insn, 15, Rs1);                                               \
    patch((address)&insn, 31, 20, csr);                                               \
    emit(insn);                                                                       \
  }

  INSN(csrrw, 0b1110011, 0b001);
  INSN(csrrs, 0b1110011, 0b010);
  INSN(csrrc, 0b1110011, 0b011);

#undef INSN

// CSR access with a 5-bit unsigned immediate operand.
// Each INSN(NAME, op, funct3) defines NAME(Rd, csr, uimm), which emits
//   bits 6:0 op | 11:7 Rd | 14:12 funct3 | 19:15 uimm | 31:20 csr
// The CSR number must fit in 12 unsigned bits and uimm in 5, otherwise the
// corresponding guarantee fails.
#define INSN(NAME, op, funct3)                                                        \
  void NAME(Register Rd, const uint32_t csr, const uint32_t uimm) {                   \
    guarantee(is_uimm12(csr), "csr is invalid");                                      \
    guarantee(is_uimm5(uimm), "uimm is invalid");                                     \
    unsigned insn = 0;                                                                \
    uint32_t val  = uimm & 0x1f;                                                      \
    patch((address)&insn, 6, 0, op);                                                  \
    patch((address)&insn, 14, 12, funct3);                                            \
    patch_reg((address)&insn, 7, Rd);                                                 \
    patch((address)&insn, 19, 15, val);                                               \
    patch((address)&insn, 31, 20, csr);                                               \
    emit(insn);                                                                       \
  }

  INSN(csrrwi, 0b1110011, 0b101);
  INSN(csrrsi, 0b1110011, 0b110);
  INSN(csrrci, 0b1110011, 0b111);

#undef INSN

 private:
  // All calls and jumps must go via MASM.
  // Format J-type
  // The offset must be an even signed 21-bit value; bit 0 is not encoded.
  // The remaining bits are scattered over the word:
  //   offset[19:12] -> bits 19:12   offset[11] -> bit 20
  //   offset[10:1]  -> bits 30:21   offset[20] -> bit 31
  void _jal(Register Rd, const int32_t offset) {
    guarantee(is_simm21(offset) && ((offset % 2) == 0), "offset is invalid.");
    // Work on the unsigned bit pattern; every field below is masked anyway.
    const uint32_t imm = (uint32_t)offset;
    unsigned encoding = 0;
    address const word = (address)&encoding;
    patch(word, 6, 0, 0b1101111);
    patch_reg(word, 7, Rd);
    patch(word, 19, 12, (imm >> 12) & 0xff);
    patch(word, 20, (imm >> 11) & 0x1);
    patch(word, 30, 21, (imm >> 1) & 0x3ff);
    patch(word, 31, (imm >> 20) & 0x1);
    emit(encoding);
  }

  // Format I-type
  // Emits bits 6:0 opcode | 11:7 Rd | 14:12 zero | 19:15 Rs | 31:20 offset.
  // The offset must be a signed 12-bit value, otherwise the guarantee fails.
  void _jalr(Register Rd, Register Rs, const int32_t offset) {
    guarantee(is_simm12(offset), "offset is invalid.");
    unsigned encoding = 0;
    address const word = (address)&encoding;
    patch(word, 6, 0, 0b1100111);
    patch_reg(word, 7, Rd);
    patch(word, 14, 12, 0b000);
    patch_reg(word, 15, Rs);
    // Low 12 bits of the two's-complement displacement.
    const int32_t imm12 = offset & 0xfff;
    patch(word, 31, 20, imm12);
    emit(encoding);
  }

 protected:

  // Bit flags forming the 4-bit predecessor / successor sets passed to fence().
  // NOTE(review): presumably i/o are device input/output and r/w are memory
  // reads/writes, as in the RISC-V FENCE instruction -- confirm.
  enum barrier {
    i = 0b1000, o = 0b0100, r = 0b0010, w = 0b0001,
    ir = i | r, ow = o | w, iorw = i | o | r | w
  };

  void fence(const uint32_t predecessor, const uint32_t successor) {
    unsigned insn = 0;
    guarantee(predecessor < 16, "predecessor is invalid");
    guarantee(successor < 16, "successor is invalid");
    patch((address)&insn, 6, 0, 0b001111);      // opcode
    patch((address)&insn, 11, 7, 0b00000);      // rd
    patch((address)&insn, 14, 12, 0b000);
    patch((address)&insn, 19, 15, 0b00000);     // rs1
    patch((address)&insn, 23, 20, successor);   // succ
    patch((address)&insn, 27, 24, predecessor); // pred
    patch((address)&insn, 31, 28, 0b0000);      // fm
    emit(insn);
  }

  void fencei() {
    unsigned insn = 0;
    patch((address)&insn,  6,  0, 0b0001111);      // opcode
    patch((address)&insn, 11,  7, 0b00000);        // rd
    patch((address)&insn, 14, 12, 0b001);          // func
    patch((address)&insn, 19, 15, 0b00000);        // rs1
    patch((address)&insn, 31, 20, 0b000000000000); // fm
    emit(insn);
  }

 public:

// Generates operand-less instructions: rd (bits 11:7) and rs1 (bits 19:15)
// are zero. Although the last macro parameter is named funct7, it is patched
// into the full 12-bit field at bits 31:20.
#define INSN(NAME, op, funct3, funct7)                      \
  void NAME() {                                             \
    unsigned insn = 0;                                      \
    patch((address)&insn, 6, 0, op);                        \
    patch((address)&insn, 11, 7, 0b00000);                  \
    patch((address)&insn, 14, 12, funct3);                  \
    patch((address)&insn, 19, 15, 0b00000);                 \
    patch((address)&insn, 31, 20, funct7);                  \
    emit(insn);                                             \
  }

  INSN(ecall,   0b1110011, 0b000, 0b000000000000);
  INSN(_ebreak, 0b1110011, 0b000, 0b000000000001);

#undef INSN

  // Two-bit ordering annotation that amo_base patches into bits 26:25:
  // the high bit is aq, the low bit is rl, and aqrl sets both.
  enum Aqrl {relaxed = 0b00, rl = 0b01, aq = 0b10, aqrl = 0b11};

 private:

  // Access-width selector that amo_base patches into funct3 (bits 14:12).
  // amo_base asserts UseZabha for the byte and halfword widths.
  enum AmoWidthFunct3 : uint8_t {
    AMO_WIDTH_BYTE        = 0b000, // Zabha extension
    AMO_WIDTH_HALFWORD    = 0b001, // Zabha extension
    AMO_WIDTH_WORD        = 0b010,
    AMO_WIDTH_DOUBLEWORD  = 0b011,
    AMO_WIDTH_QUADWORD    = 0b100,
    // 0b101 to 0b111 are reserved
  };

  // Operation selector that amo_base patches into funct5 (bits 31:27).
  // AMO_CAS is only accepted by amo_base when UseZacas is set.
  enum AmoOperationFunct5 : uint8_t {
    AMO_ADD  = 0b00000,
    AMO_SWAP = 0b00001,
    AMO_LR   = 0b00010,
    AMO_SC   = 0b00011,
    AMO_XOR  = 0b00100,
    AMO_OR   = 0b01000,
    AMO_AND  = 0b01100,
    AMO_MIN  = 0b10000,
    AMO_MAX  = 0b10100,
    AMO_MINU = 0b11000,
    AMO_MAXU = 0b11100,
    AMO_CAS  = 0b00101 // Zacas
  };

  // Major opcode (bits 6:0) shared by every instruction emitted via amo_base.
  static constexpr uint32_t OP_AMO_MAJOR = 0b0101111;

  // Shared encoder for the AMO / LR / SC / CAS family.
  //   funct5, width - compile-time operation and access width.
  //   Rd, Rs1       - patched at bits 11:7 and 19:15.
  //   Rs2           - raw 5-bit value for bits 24:20 (lr_* pass 0 here).
  //   memory_order  - aq/rl bits for bits 26:25.
  // Byte/halfword widths assert UseZabha; AMO_CAS asserts UseZacas.
  template <AmoOperationFunct5 funct5, AmoWidthFunct3 width>
  void amo_base(Register Rd, Register Rs1, uint8_t Rs2, Aqrl memory_order = aqrl) {
    assert(width > AMO_WIDTH_HALFWORD || UseZabha, "Must be");
    assert(funct5 != AMO_CAS || UseZacas, "Must be");
    unsigned insn = 0;
    const address bits = (address)&insn;
    patch(bits,  6,  0, OP_AMO_MAJOR);
    patch_reg(bits,  7, Rd);
    patch(bits, 14, 12, width);
    patch_reg(bits, 15, Rs1);
    patch(bits, 24, 20, Rs2);
    patch(bits, 26, 25, memory_order);
    patch(bits, 31, 27, funct5);
    emit(insn);
  }

  // Convenience overload: takes Rs2 as a Register and forwards its
  // raw_encoding() to the raw-rs2 amo_base.
  template <AmoOperationFunct5 funct5, AmoWidthFunct3 width>
  void amo_base(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<funct5, width>(Rd, Rs1, Rs2->raw_encoding(), memory_order);
  }

 public:

  // amoadd.b: AMO_ADD with byte width; amo_base asserts UseZabha for this width.
  void amoadd_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_ADD, AMO_WIDTH_BYTE>(Rd, Rs1, Rs2, memory_order);
  }

  // amoadd.h: AMO_ADD with halfword width; amo_base asserts UseZabha for this width.
  void amoadd_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_ADD, AMO_WIDTH_HALFWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoadd.w: AMO_ADD with word width; memory_order defaults to aqrl.
  void amoadd_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_ADD, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoadd.d: AMO_ADD with doubleword width; memory_order defaults to aqrl.
  void amoadd_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_ADD, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoswap.b: AMO_SWAP with byte width; amo_base asserts UseZabha for this width.
  void amoswap_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_SWAP, AMO_WIDTH_BYTE>(Rd, Rs1, Rs2, memory_order);
  }

  // amoswap.h: AMO_SWAP with halfword width; amo_base asserts UseZabha for this width.
  void amoswap_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_SWAP, AMO_WIDTH_HALFWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoswap.w: AMO_SWAP with word width; memory_order defaults to aqrl.
  void amoswap_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_SWAP, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoswap.d: AMO_SWAP with doubleword width; memory_order defaults to aqrl.
  void amoswap_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_SWAP, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoxor.b: AMO_XOR with byte width; amo_base asserts UseZabha for this width.
  void amoxor_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_XOR, AMO_WIDTH_BYTE>(Rd, Rs1, Rs2, memory_order);
  }

  // amoxor.h: AMO_XOR with halfword width; amo_base asserts UseZabha for this width.
  void amoxor_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_XOR, AMO_WIDTH_HALFWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoxor.w: AMO_XOR with word width; memory_order defaults to aqrl.
  void amoxor_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_XOR, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoxor.d: AMO_XOR with doubleword width; memory_order defaults to aqrl.
  void amoxor_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_XOR, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoor.b: AMO_OR with byte width; amo_base asserts UseZabha for this width.
  void amoor_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_OR, AMO_WIDTH_BYTE>(Rd, Rs1, Rs2, memory_order);
  }

  // amoor.h: AMO_OR with halfword width; amo_base asserts UseZabha for this width.
  void amoor_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_OR, AMO_WIDTH_HALFWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoor.w: AMO_OR with word width; memory_order defaults to aqrl.
  void amoor_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_OR, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoor.d: AMO_OR with doubleword width; memory_order defaults to aqrl.
  void amoor_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_OR, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoand.b: AMO_AND with byte width; amo_base asserts UseZabha for this width.
  void amoand_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_AND, AMO_WIDTH_BYTE>(Rd, Rs1, Rs2, memory_order);
  }

  // amoand.h: AMO_AND with halfword width; amo_base asserts UseZabha for this width.
  void amoand_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_AND, AMO_WIDTH_HALFWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoand.w: AMO_AND with word width; memory_order defaults to aqrl.
  void amoand_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_AND, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amoand.d: AMO_AND with doubleword width; memory_order defaults to aqrl.
  void amoand_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_AND, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amomin.b: AMO_MIN with byte width; amo_base asserts UseZabha for this width.
  void amomin_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MIN, AMO_WIDTH_BYTE>(Rd, Rs1, Rs2, memory_order);
  }

  // amomin.h: AMO_MIN with halfword width; amo_base asserts UseZabha for this width.
  void amomin_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MIN, AMO_WIDTH_HALFWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amomin.w: AMO_MIN with word width; memory_order defaults to aqrl.
  void amomin_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MIN, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amomin.d: AMO_MIN with doubleword width; memory_order defaults to aqrl.
  void amomin_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MIN, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amominu.b: AMO_MINU with byte width; amo_base asserts UseZabha for this width.
  void amominu_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MINU, AMO_WIDTH_BYTE>(Rd, Rs1, Rs2, memory_order);
  }

  // amominu.h: AMO_MINU with halfword width; amo_base asserts UseZabha for this width.
  void amominu_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MINU, AMO_WIDTH_HALFWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amominu.w: AMO_MINU with word width; memory_order defaults to aqrl.
  void amominu_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MINU, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amominu.d: AMO_MINU with doubleword width; memory_order defaults to aqrl.
  void amominu_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MINU, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amomax.b: AMO_MAX with byte width; amo_base asserts UseZabha for this width.
  void amomax_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MAX, AMO_WIDTH_BYTE>(Rd, Rs1, Rs2, memory_order);
  }

  // amomax.h: AMO_MAX with halfword width; amo_base asserts UseZabha for this width.
  void amomax_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MAX, AMO_WIDTH_HALFWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amomax.w: AMO_MAX with word width; memory_order defaults to aqrl.
  void amomax_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MAX, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amomax.d: AMO_MAX with doubleword width; memory_order defaults to aqrl.
  void amomax_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MAX, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amomaxu.b: AMO_MAXU with byte width; amo_base asserts UseZabha for this width.
  void amomaxu_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MAXU, AMO_WIDTH_BYTE>(Rd, Rs1, Rs2, memory_order);
  }

  // amomaxu.h: AMO_MAXU with halfword width; amo_base asserts UseZabha for this width.
  void amomaxu_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MAXU, AMO_WIDTH_HALFWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amomaxu.w: AMO_MAXU with word width; memory_order defaults to aqrl.
  void amomaxu_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MAXU, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amomaxu.d: AMO_MAXU with doubleword width; memory_order defaults to aqrl.
  void amomaxu_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_MAXU, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

 protected:

  // lr.w: AMO_LR with word width; the rs2 field is encoded as 0.
  void lr_w(Register Rd, Register Rs1, Aqrl memory_order = aqrl) {
    amo_base<AMO_LR, AMO_WIDTH_WORD>(Rd, Rs1, 0, memory_order);
  }

  // lr.d: AMO_LR with doubleword width; the rs2 field is encoded as 0.
  void lr_d(Register Rd, Register Rs1, Aqrl memory_order = aqrl) {
    amo_base<AMO_LR, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, 0, memory_order);
  }

  // sc.w: AMO_SC with word width; memory_order defaults to aqrl.
  void sc_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_SC, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // sc.d: AMO_SC with doubleword width; memory_order defaults to aqrl.
  void sc_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_SC, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amocas.b: AMO_CAS with byte width; amo_base asserts both UseZacas and UseZabha.
  void amocas_b(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_CAS, AMO_WIDTH_BYTE>(Rd, Rs1, Rs2, memory_order);
  }

  // amocas.h: AMO_CAS with halfword width; amo_base asserts both UseZacas and UseZabha.
  void amocas_h(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_CAS, AMO_WIDTH_HALFWORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amocas.w: AMO_CAS with word width; amo_base asserts UseZacas.
  void amocas_w(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_CAS, AMO_WIDTH_WORD>(Rd, Rs1, Rs2, memory_order);
  }

  // amocas.d: AMO_CAS with doubleword width; amo_base asserts UseZacas.
  void amocas_d(Register Rd, Register Rs1, Register Rs2, Aqrl memory_order = aqrl) {
    amo_base<AMO_CAS, AMO_WIDTH_DOUBLEWORD>(Rd, Rs1, Rs2, memory_order);
  }

 public:

  // Operand size tags (values 0..4 in declaration order). Nothing in this
  // section consumes them; presumably used by callers elsewhere - confirm.
  enum operand_size { int8, int16, int32, uint32, int64 };

// Immediate Instruction
// Generates register-immediate instructions whose immediate must pass
// is_simm12. The low 12 bits of imm go to bits 31:20, Rd to bit 7 and Rs1 to
// bit 15 (via patch_reg).
#define INSN(NAME, op, funct3)                                                              \
  void NAME(Register Rd, Register Rs1, int64_t imm) {                                       \
    guarantee(is_simm12(imm), "Immediate is out of validity");                              \
    unsigned insn = 0;                                                                      \
    patch((address)&insn, 6, 0, op);                                                        \
    patch((address)&insn, 14, 12, funct3);                                                  \
    patch((address)&insn, 31, 20, imm & 0x00000fff);                                        \
    patch_reg((address)&insn, 7, Rd);                                                       \
    patch_reg((address)&insn, 15, Rs1);                                                     \
    emit(insn);                                                                             \
  }

  INSN(_addi,  0b0010011, 0b000);
  INSN(_addiw, 0b0011011, 0b000);
  INSN(_andi,  0b0010011, 0b111);
  INSN(ori,    0b0010011, 0b110);
  INSN(xori,   0b0010011, 0b100);
  INSN(slti,   0b0010011, 0b010);

#undef INSN

// Same layout as the signed-immediate form, but the immediate is taken as
// uint64_t and validated with is_uimm12 instead of is_simm12.
#define INSN(NAME, op, funct3)                                                              \
  void NAME(Register Rd, Register Rs1, uint64_t imm) {                                      \
    guarantee(is_uimm12(imm), "Immediate is out of validity");                              \
    unsigned insn = 0;                                                                      \
    patch((address)&insn,6, 0,  op);                                                        \
    patch((address)&insn, 14, 12, funct3);                                                  \
    patch((address)&insn, 31, 20, imm & 0x00000fff);                                        \
    patch_reg((address)&insn, 7, Rd);                                                       \
    patch_reg((address)&insn, 15, Rs1);                                                     \
    emit(insn);                                                                             \
  }

  INSN(sltiu, 0b0010011, 0b011);

#undef INSN

// Shift Immediate Instruction
// Generates shift-by-immediate instructions with a 6-bit shift amount
// (shamt <= 0x3f) in bits 25:20 and a 6-bit function code in bits 31:26.
#define INSN(NAME, op, funct3, funct6)                                   \
  void NAME(Register Rd, Register Rs1, unsigned shamt) {                 \
    guarantee(shamt <= 0x3f, "Shamt is invalid");                        \
    unsigned insn = 0;                                                   \
    patch((address)&insn, 6, 0, op);                                     \
    patch((address)&insn, 14, 12, funct3);                               \
    patch((address)&insn, 25, 20, shamt);                                \
    patch((address)&insn, 31, 26, funct6);                               \
    patch_reg((address)&insn, 7, Rd);                                    \
    patch_reg((address)&insn, 15, Rs1);                                  \
    emit(insn);                                                          \
  }

  INSN(_slli, 0b0010011, 0b001, 0b000000);
  INSN(_srai, 0b0010011, 0b101, 0b010000);
  INSN(_srli, 0b0010011, 0b101, 0b000000);

#undef INSN

// Shift Word Immediate Instruction
// Generates word-sized shift-by-immediate instructions with a 5-bit shift
// amount (shamt <= 0x1f) in bits 24:20 and a 7-bit function code in bits 31:25.
#define INSN(NAME, op, funct3, funct7)                                  \
  void NAME(Register Rd, Register Rs1, unsigned shamt) {                \
    guarantee(shamt <= 0x1f, "Shamt is invalid");                       \
    unsigned insn = 0;                                                  \
    patch((address)&insn, 6, 0, op);                                    \
    patch((address)&insn, 14, 12, funct3);                              \
    patch((address)&insn, 24, 20, shamt);                               \
    patch((address)&insn, 31, 25, funct7);                              \
    patch_reg((address)&insn, 7, Rd);                                   \
    patch_reg((address)&insn, 15, Rs1);                                 \
    emit(insn);                                                         \
  }

  INSN(slliw, 0b0011011, 0b001, 0b0000000);
  INSN(sraiw, 0b0011011, 0b101, 0b0100000);
  INSN(srliw, 0b0011011, 0b101, 0b0000000);

#undef INSN

// Upper Immediate Instruction
// Generates upper-immediate instructions. The caller passes the full 32-bit
// value; its low 12 bits are dropped (imm >> 12) and the remaining 20 bits
// are placed in bits 31:12. No range check is performed on imm here.
#define INSN(NAME, op)                                                  \
  void NAME(Register Rd, int32_t imm) {                                 \
    int32_t upperImm = imm >> 12;                                       \
    unsigned insn = 0;                                                  \
    patch((address)&insn, 6, 0, op);                                    \
    patch_reg((address)&insn, 7, Rd);                                   \
    upperImm &= 0x000fffff;                                             \
    patch((address)&insn, 31, 12, upperImm);                            \
    emit(insn);                                                         \
  }

  INSN(_lui,  0b0110111);
  INSN(auipc, 0b0010111);

#undef INSN

// ==========================
// Floating Point Instructions
// ==========================
  // Major opcode (bits 6:0) shared by every instruction emitted via fp_base.
  static constexpr uint32_t OP_FP_MAJOR = 0b1010011;

  // Two-bit floating-point format selector patched into bits 26:25 by fp_base
  // and fp_fm. Both of those reject Q_128_qp via assert_cond.
  enum FmtPrecision : uint8_t {
    S_32_sp  = 0b00,
    D_64_dp  = 0b01,
    H_16_hp  = 0b10,
    Q_128_qp = 0b11
  };

 private:

  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(uint8_t Rd, uint8_t Rs1, uint8_t Rs2, RoundingMode rm) {
    assert(Fmt != H_16_hp || UseZfh || UseZfhmin, "No half precision enabled");
    assert_cond(Fmt != Q_128_qp);
    guarantee(is_uimm3(rm), "Rounding mode is out of validity");
    guarantee(is_uimm2(Fmt), "FMT is out of validity");
    guarantee(is_uimm5(funct5), "Funct5 is out of validity");
    uint32_t insn = 0;
    patch((address)&insn,   6, 0, OP_FP_MAJOR);
    patch((address)&insn, 11,  7, Rd);
    patch((address)&insn, 14, 12, rm);
    patch((address)&insn, 19, 15, Rs1);
    patch((address)&insn, 24, 20, Rs2);
    patch((address)&insn, 26, 25, Fmt);
    patch((address)&insn, 31, 27, funct5);
    emit(insn);
  }

  // Overload: three FloatRegisters and an explicit RoundingMode; forwards the
  // raw encodings to the raw-number fp_base.
  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm) {
    fp_base<Fmt, funct5>(Rd->raw_encoding(), Rs1->raw_encoding(), Rs2->raw_encoding(), rm);
  }

  // Overload: three FloatRegisters with the rm field given as a plain integer
  // (used when bits 14:12 act as a sub-opcode); it is cast to RoundingMode.
  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, int8_t rm) {
    fp_base<Fmt, funct5>(Rd->raw_encoding(), Rs1->raw_encoding(), Rs2->raw_encoding(), (RoundingMode)rm);
  }

  // Overload: integer destination Register, two FloatRegister sources, and the
  // rm field given as a plain integer that is cast to RoundingMode.
  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(Register Rd, FloatRegister Rs1, FloatRegister Rs2, int8_t rm) {
    fp_base<Fmt, funct5>(Rd->raw_encoding(), Rs1->raw_encoding(), Rs2->raw_encoding(), (RoundingMode)rm);
  }

  // Overload: FloatRegister Rd/Rs1 with a raw value for the rs2 field (must
  // pass is_uimm5) and the rm field given as a plain integer.
  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(FloatRegister Rd, FloatRegister Rs1, int8_t Rs2, int8_t rm) {
    guarantee(is_uimm5(Rs2), "Rs2 is out of validity");
    fp_base<Fmt, funct5>(Rd->raw_encoding(), Rs1->raw_encoding(), Rs2, (RoundingMode)rm);
  }

  // Overload: FloatRegister Rd, integer Register Rs1, FloatRegister Rs2, and an
  // explicit RoundingMode; forwards the raw encodings.
  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(FloatRegister Rd, Register Rs1, FloatRegister Rs2, RoundingMode rm) {
    fp_base<Fmt, funct5>(Rd->raw_encoding(), Rs1->raw_encoding(), Rs2->raw_encoding(), rm);
  }

  // Overload: integer Register Rd, FloatRegister Rs1, a raw rs2 field value
  // (must pass is_uimm5) and an explicit RoundingMode.
  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(Register Rd, FloatRegister Rs1, uint8_t Rs2, RoundingMode rm) {
    guarantee(is_uimm5(Rs2), "Rs2 is out of validity");
    fp_base<Fmt, funct5>(Rd->raw_encoding(), Rs1->raw_encoding(), Rs2, rm);
  }

  // Overload: integer Register Rd, FloatRegister Rs1, a raw rs2 field value
  // (must pass is_uimm5) and the rm field as a plain integer cast to RoundingMode.
  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(Register Rd, FloatRegister Rs1, uint8_t Rs2, uint8_t rm) {
    guarantee(is_uimm5(Rs2), "Rs2 is out of validity");
    fp_base<Fmt, funct5>(Rd->raw_encoding(), Rs1->raw_encoding(), Rs2, (RoundingMode)rm);
  }

  // Overload: FloatRegister Rd, integer Register Rs1, a raw rs2 field value
  // (must pass is_uimm5) and an explicit RoundingMode.
  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(FloatRegister Rd, Register Rs1, uint8_t Rs2, RoundingMode rm) {
    guarantee(is_uimm5(Rs2), "Rs2 is out of validity");
    fp_base<Fmt, funct5>(Rd->raw_encoding(), Rs1->raw_encoding(), Rs2, rm);
  }

  // Overload: FloatRegister Rd, integer Register Rs1, a raw rs2 field value
  // (must pass is_uimm5) and the rm field as a plain integer cast to RoundingMode.
  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(FloatRegister Rd, Register Rs1, uint8_t Rs2, int8_t rm) {
    guarantee(is_uimm5(Rs2), "Rs2 is out of validity");
    fp_base<Fmt, funct5>(Rd->raw_encoding(), Rs1->raw_encoding(), Rs2, (RoundingMode)rm);
  }

  // Overload: FloatRegister Rd with raw values for both the rs1 and rs2 fields
  // (each must pass is_uimm5) and the rm field as a plain integer. Used by the
  // _fli_* emitters, which pass a raw value in place of rs1.
  template <FmtPrecision Fmt, uint8_t funct5>
  void fp_base(FloatRegister Rd, uint8_t Rs1, uint8_t Rs2, int8_t rm) {
    guarantee(is_uimm5(Rs1), "Rs1 is out of validity");
    guarantee(is_uimm5(Rs2), "Rs2 is out of validity");
    fp_base<Fmt, funct5>(Rd->raw_encoding(), Rs1, Rs2, (RoundingMode)rm);
  }

 public:

  // One-hot class bits (bits 0..9) followed by convenience unions of them.
  // Presumably these match the result mask produced by the fclass_*
  // instructions - confirm against the ISA specification.
  enum FClassBits {
    minf       = 1 << 0,   // negative infinity
    mnorm      = 1 << 1,   // negative normal number
    msubnorm   = 1 << 2,   // negative subnormal number
    mzero      = 1 << 3,   // negative zero
    pzero      = 1 << 4,   // positive zero
    psubnorm   = 1 << 5,   // positive subnormal number
    pnorm      = 1 << 6,   // positive normal number
    pinf       = 1 << 7,   // positive infinity
    snan       = 1 << 8,   // signaling NaN
    qnan       = 1 << 9,   // quiet NaN
    zero       = mzero    | pzero,
    subnorm    = msubnorm | psubnorm,
    norm       = mnorm    | pnorm,
    inf        = minf     | pinf,
    nan        = snan     | qnan,
    finite     = zero     | subnorm   | norm,
  };

  // fsqrt.s: single-precision format, funct5 0b01011, rs2 field 0; rm defaults to rne.
  void fsqrt_s(FloatRegister Rd, FloatRegister Rs1, RoundingMode rm = rne) {
    fp_base<S_32_sp, 0b01011>(Rd, Rs1, 0b00000, rm);
  }

  // fsqrt.d: double-precision format, funct5 0b01011, rs2 field 0; rm defaults to rne.
  void fsqrt_d(FloatRegister Rd, FloatRegister Rs1, RoundingMode rm = rne) {
    fp_base<D_64_dp, 0b01011>(Rd, Rs1, 0b00000, rm);
  }

  // fcvt.s.d: funct5 0b01000 with single-precision format; rs2 field 0b00001
  // selects the double-precision source. rm defaults to rne.
  void fcvt_s_d(FloatRegister Rd, FloatRegister Rs1, RoundingMode rm = rne) {
    fp_base<S_32_sp, 0b01000>(Rd, Rs1, 0b00001, rm);
  }

  // fcvt.d.s: funct5 0b01000 with double-precision format; rs2 field 0b00000
  // selects the single-precision source. rm defaults to rne.
  void fcvt_d_s(FloatRegister Rd, FloatRegister Rs1, RoundingMode rm = rne) {
    fp_base<D_64_dp, 0b01000>(Rd, Rs1, 0b00000, rm);
  }

  // fsgnj.s: single-precision, funct5 0b00100; the rm field carries selector 0b000.
  void fsgnj_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<S_32_sp, 0b00100>(Rd, Rs1, Rs2, 0b000);
  }

  // fsgnjn.s: single-precision, funct5 0b00100; the rm field carries selector 0b001.
  void fsgnjn_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<S_32_sp, 0b00100>(Rd, Rs1, Rs2, 0b001);
  }

  // fsgnjx.s: single-precision, funct5 0b00100; the rm field carries selector 0b010.
  void fsgnjx_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<S_32_sp, 0b00100>(Rd, Rs1, Rs2, 0b010);
  }

  // fmin.s: single-precision, funct5 0b00101; the rm field carries selector 0b000.
  void fmin_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<S_32_sp, 0b00101>(Rd, Rs1, Rs2, 0b000);
  }

  // fmax.s: single-precision, funct5 0b00101; the rm field carries selector 0b001.
  void fmax_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<S_32_sp, 0b00101>(Rd, Rs1, Rs2, 0b001);
  }

  // fsgnj.d: double-precision, funct5 0b00100; the rm field carries selector 0b000.
  void fsgnj_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<D_64_dp, 0b00100>(Rd, Rs1, Rs2, 0b000);
  }

  // fsgnjn.d: double-precision, funct5 0b00100; the rm field carries selector 0b001.
  void fsgnjn_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<D_64_dp, 0b00100>(Rd, Rs1, Rs2, 0b001);
  }

  // fsgnjx.d: double-precision, funct5 0b00100; the rm field carries selector 0b010.
  void fsgnjx_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<D_64_dp, 0b00100>(Rd, Rs1, Rs2, 0b010);
  }

  // fmin.d: double-precision, funct5 0b00101; the rm field carries selector 0b000.
  void fmin_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<D_64_dp, 0b00101>(Rd, Rs1, Rs2, 0b000);
  }

  // fmax.d: double-precision, funct5 0b00101; the rm field carries selector 0b001.
  void fmax_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<D_64_dp, 0b00101>(Rd, Rs1, Rs2, 0b001);
  }

  // feq.s: single-precision compare into integer Rd; funct5 0b10100, selector 0b010.
  void feq_s(Register Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<S_32_sp, 0b10100>(Rd, Rs1, Rs2, 0b010);
  }

  // flt.s: single-precision compare into integer Rd; funct5 0b10100, selector 0b001.
  void flt_s(Register Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<S_32_sp, 0b10100>(Rd, Rs1, Rs2, 0b001);
  }

  // fle.s: single-precision compare into integer Rd; funct5 0b10100, selector 0b000.
  void fle_s(Register Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<S_32_sp, 0b10100>(Rd, Rs1, Rs2, 0b000);
  }

  // feq.d: double-precision compare into integer Rd; funct5 0b10100, selector 0b010.
  void feq_d(Register Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<D_64_dp, 0b10100>(Rd, Rs1, Rs2, 0b010);
  }

  // fle.d: double-precision compare into integer Rd; funct5 0b10100, selector 0b000.
  void fle_d(Register Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<D_64_dp, 0b10100>(Rd, Rs1, Rs2, 0b000);
  }

  // flt.d: double-precision compare into integer Rd; funct5 0b10100, selector 0b001.
  void flt_d(Register Rd, FloatRegister Rs1, FloatRegister Rs2) {
    fp_base<D_64_dp, 0b10100>(Rd, Rs1, Rs2, 0b001);
  }

  // fadd.s: single-precision, funct5 0b00000; rm defaults to rne.
  void fadd_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    fp_base<S_32_sp, 0b00000>(Rd, Rs1, Rs2, rm);
  }

  // fsub.s: single-precision, funct5 0b00001; rm defaults to rne.
  void fsub_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    fp_base<S_32_sp, 0b00001>(Rd, Rs1, Rs2, rm);
  }

  // fmul.s: single-precision, funct5 0b00010; rm defaults to rne.
  void fmul_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    fp_base<S_32_sp, 0b00010>(Rd, Rs1, Rs2, rm);
  }

  // fdiv.s: single-precision, funct5 0b00011; rm defaults to rne.
  void fdiv_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    fp_base<S_32_sp, 0b00011>(Rd, Rs1, Rs2, rm);
  }

  // fadd.d: double-precision, funct5 0b00000; rm defaults to rne.
  void fadd_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    fp_base<D_64_dp, 0b00000>(Rd, Rs1, Rs2, rm);
  }

  // fsub.d: double-precision, funct5 0b00001; rm defaults to rne.
  void fsub_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    fp_base<D_64_dp, 0b00001>(Rd, Rs1, Rs2, rm);
  }

  // fmul.d: double-precision, funct5 0b00010; rm defaults to rne.
  void fmul_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    fp_base<D_64_dp, 0b00010>(Rd, Rs1, Rs2, rm);
  }

  // fdiv.d: double-precision, funct5 0b00011; rm defaults to rne.
  void fdiv_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    fp_base<D_64_dp, 0b00011>(Rd, Rs1, Rs2, rm);
  }

  // fcvt.s.w: integer Rs1 to single-precision Rd; funct5 0b11010, rs2 field 0b00000.
  void fcvt_s_w(FloatRegister Rd, Register Rs1, RoundingMode rm = rne) {
    fp_base<S_32_sp, 0b11010>(Rd, Rs1, 0b00000, rm);
  }

  // fcvt.s.wu: integer Rs1 to single-precision Rd; funct5 0b11010, rs2 field 0b00001.
  void fcvt_s_wu(FloatRegister Rd, Register Rs1, RoundingMode rm = rne) {
    fp_base<S_32_sp, 0b11010>(Rd, Rs1, 0b00001, rm);
  }

  // fcvt.s.l: integer Rs1 to single-precision Rd; funct5 0b11010, rs2 field 0b00010.
  void fcvt_s_l(FloatRegister Rd, Register Rs1, RoundingMode rm = rne) {
    fp_base<S_32_sp, 0b11010>(Rd, Rs1, 0b00010, rm);
  }

  // fcvt.s.lu: integer Rs1 to single-precision Rd; funct5 0b11010, rs2 field 0b00011.
  void fcvt_s_lu(FloatRegister Rd, Register Rs1, RoundingMode rm = rne) {
    fp_base<S_32_sp, 0b11010>(Rd, Rs1, 0b00011, rm);
  }

  // fcvt.d.w: integer Rs1 to double-precision Rd; funct5 0b11010, rs2 field 0b00000.
  void fcvt_d_w(FloatRegister Rd, Register Rs1, RoundingMode rm = rne) {
    fp_base<D_64_dp, 0b11010>(Rd, Rs1, 0b00000, rm);
  }

  // fcvt.d.wu: integer Rs1 to double-precision Rd; funct5 0b11010, rs2 field 0b00001.
  void fcvt_d_wu(FloatRegister Rd, Register Rs1, RoundingMode rm = rne) {
    fp_base<D_64_dp, 0b11010>(Rd, Rs1, 0b00001, rm);
  }

  // fcvt.d.l: integer Rs1 to double-precision Rd; funct5 0b11010, rs2 field 0b00010.
  void fcvt_d_l(FloatRegister Rd, Register Rs1, RoundingMode rm = rne) {
    fp_base<D_64_dp, 0b11010>(Rd, Rs1, 0b00010, rm);
  }

  // fcvt.d.lu: integer Rs1 to double-precision Rd; funct5 0b11010, rs2 field 0b00011.
  void fcvt_d_lu(FloatRegister Rd, Register Rs1, RoundingMode rm = rne) {
    fp_base<D_64_dp, 0b11010>(Rd, Rs1, 0b00011, rm);
  }

  // fcvt.w.s: single-precision Rs1 to integer Rd; funct5 0b11000, rs2 field
  // 0b00000. Note the default rounding mode here is rtz, not rne.
  void fcvt_w_s(Register Rd, FloatRegister Rs1, RoundingMode rm = rtz) {
    fp_base<S_32_sp, 0b11000>(Rd, Rs1, 0b00000, rm);
  }

  // fcvt.l.s: single-precision Rs1 to integer Rd; funct5 0b11000, rs2 field
  // 0b00010. Default rounding mode is rtz.
  void fcvt_l_s(Register Rd, FloatRegister Rs1, RoundingMode rm = rtz) {
    fp_base<S_32_sp, 0b11000>(Rd, Rs1, 0b00010, rm);
  }

  // fcvt.wu.s: single-precision Rs1 to integer Rd; funct5 0b11000, rs2 field
  // 0b00001. Default rounding mode is rtz.
  void fcvt_wu_s(Register Rd, FloatRegister Rs1, RoundingMode rm = rtz) {
    fp_base<S_32_sp, 0b11000>(Rd, Rs1, 0b00001, rm);
  }

  // fcvt.lu.s: single-precision Rs1 to integer Rd; funct5 0b11000, rs2 field
  // 0b00011. Default rounding mode is rtz.
  void fcvt_lu_s(Register Rd, FloatRegister Rs1, RoundingMode rm = rtz) {
    fp_base<S_32_sp, 0b11000>(Rd, Rs1, 0b00011, rm);
  }

  // fcvt.w.d: double-precision Rs1 to integer Rd; funct5 0b11000, rs2 field
  // 0b00000. Default rounding mode is rtz.
  void fcvt_w_d(Register Rd, FloatRegister Rs1, RoundingMode rm = rtz) {
    fp_base<D_64_dp, 0b11000>(Rd, Rs1, 0b00000, rm);
  }

  // fcvt.wu.d: double-precision Rs1 to integer Rd; funct5 0b11000, rs2 field
  // 0b00001. Default rounding mode is rtz.
  void fcvt_wu_d(Register Rd, FloatRegister Rs1, RoundingMode rm = rtz) {
    fp_base<D_64_dp, 0b11000>(Rd, Rs1, 0b00001, rm);
  }

  // fcvt.l.d: double-precision Rs1 to integer Rd; funct5 0b11000, rs2 field
  // 0b00010. Default rounding mode is rtz.
  void fcvt_l_d(Register Rd, FloatRegister Rs1, RoundingMode rm = rtz) {
    fp_base<D_64_dp, 0b11000>(Rd, Rs1, 0b00010, rm);
  }

  // fcvt.lu.d: double-precision Rs1 to integer Rd; funct5 0b11000, rs2 field
  // 0b00011. Default rounding mode is rtz.
  void fcvt_lu_d(Register Rd, FloatRegister Rs1, RoundingMode rm = rtz) {
    fp_base<D_64_dp, 0b11000>(Rd, Rs1, 0b00011, rm);
  }

  // fmv.w.x: integer Rs1 to FloatRegister Rd; single-precision format,
  // funct5 0b11110, rs2 and rm fields both 0.
  void fmv_w_x(FloatRegister Rd, Register Rs1) {
    fp_base<S_32_sp, 0b11110>(Rd, Rs1, 0b00000, 0b000);
  }

  // fmv.d.x: integer Rs1 to FloatRegister Rd; double-precision format,
  // funct5 0b11110, rs2 and rm fields both 0.
  void fmv_d_x(FloatRegister Rd, Register Rs1) {
    fp_base<D_64_dp, 0b11110>(Rd, Rs1, 0b00000, 0b000);
  }

  // fclass.s: single-precision, funct5 0b11100, rs2 field 0, rm field 0b001.
  // The result presumably uses the FClassBits layout - confirm against the ISA spec.
  void fclass_s(Register Rd, FloatRegister Rs1) {
    fp_base<S_32_sp, 0b11100>(Rd, Rs1, 0b00000, 0b001);
  }

  // fclass.d: double-precision, funct5 0b11100, rs2 field 0, rm field 0b001.
  // The result presumably uses the FClassBits layout - confirm against the ISA spec.
  void fclass_d(Register Rd, FloatRegister Rs1) {
    fp_base<D_64_dp, 0b11100>(Rd, Rs1, 0b00000, 0b001);
  }

  // fmv.x.w: FloatRegister Rs1 to integer Rd; single-precision format,
  // funct5 0b11100, rs2 and rm fields both 0.
  void fmv_x_w(Register Rd, FloatRegister Rs1) {
    fp_base<S_32_sp, 0b11100>(Rd, Rs1, 0b00000, 0b000);
  }

  // fmv.x.d: FloatRegister Rs1 to integer Rd; double-precision format,
  // funct5 0b11100, rs2 and rm fields both 0.
  void fmv_x_d(Register Rd, FloatRegister Rs1) {
    fp_base<D_64_dp, 0b11100>(Rd, Rs1, 0b00000, 0b000);
  }

 private:
  // Shared encoder for the four-operand fused multiply-add family.
  //   Fmt   - format for bits 26:25 (quad precision is rejected).
  //   OpVal - major opcode for bits 6:0; it selects the specific operation.
  //   Rd, Rs1, Rs2, Rs3 - patched at bits 7, 15, 20 and 27 respectively.
  //   rm    - value for bits 14:12; must pass is_uimm3.
  // Unlike fp_base, no half-precision flag is asserted here; the half
  // precision caller visible in this file (fmadd_h) asserts UseZfh itself.
  template <FmtPrecision Fmt, uint8_t OpVal>
  void fp_fm(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, FloatRegister Rs3, RoundingMode rm) {
    assert_cond(Fmt != Q_128_qp);
    guarantee(is_uimm3(rm), "Rounding mode is out of validity");
    guarantee(is_uimm2(Fmt), "FMT is out of validity");
    unsigned insn = 0;
    const address bits = (address)&insn;
    patch(bits,  6,  0, OpVal);
    patch_reg(bits,  7, Rd);
    patch(bits, 14, 12, rm);
    patch_reg(bits, 15, Rs1);
    patch_reg(bits, 20, Rs2);
    patch(bits, 26, 25, Fmt);
    patch_reg(bits, 27, Rs3);
    emit(insn);
  }

 public:
  // fmadd.s: single-precision, major opcode 0b1000011; rm defaults to rne.
  void fmadd_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, FloatRegister Rs3, RoundingMode rm = rne)  {
    fp_fm<S_32_sp, 0b1000011>(Rd, Rs1, Rs2, Rs3, rm);
  }

  // fmsub.s: single-precision, major opcode 0b1000111; rm defaults to rne.
  void fmsub_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, FloatRegister Rs3, RoundingMode rm = rne)  {
    fp_fm<S_32_sp, 0b1000111>(Rd, Rs1, Rs2, Rs3, rm);
  }

  // fnmsub.s: single-precision, major opcode 0b1001011; rm defaults to rne.
  void fnmsub_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, FloatRegister Rs3, RoundingMode rm = rne) {
    fp_fm<S_32_sp, 0b1001011>(Rd, Rs1, Rs2, Rs3, rm);
  }

  // fnmadd.s: single-precision, major opcode 0b1001111; rm defaults to rne.
  void fnmadd_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, FloatRegister Rs3, RoundingMode rm = rne) {
    fp_fm<S_32_sp, 0b1001111>(Rd, Rs1, Rs2, Rs3, rm);
  }

  // fmadd.d: double-precision, major opcode 0b1000011; rm defaults to rne.
  void fmadd_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, FloatRegister Rs3, RoundingMode rm = rne)  {
    fp_fm<D_64_dp, 0b1000011>(Rd, Rs1, Rs2, Rs3, rm);
  }

  // fmsub.d: double-precision, major opcode 0b1000111; rm defaults to rne.
  void fmsub_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, FloatRegister Rs3, RoundingMode rm = rne)  {
    fp_fm<D_64_dp, 0b1000111>(Rd, Rs1, Rs2, Rs3, rm);
  }

  // fnmsub.d: double-precision, major opcode 0b1001011; rm defaults to rne.
  void fnmsub_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, FloatRegister Rs3, RoundingMode rm = rne) {
    fp_fm<D_64_dp, 0b1001011>(Rd, Rs1, Rs2, Rs3, rm);
  }

  // fnmadd.d: double-precision, major opcode 0b1001111; rm defaults to rne.
  void fnmadd_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, FloatRegister Rs3, RoundingMode rm = rne) {
    fp_fm<D_64_dp, 0b1001111>(Rd, Rs1, Rs2, Rs3, rm);
  }

// --------------  ZFH Instruction Definitions  --------------
// Zfh Standard Extensions for Half-Precision Floating-Point
  // fclass.h: half-precision, funct5 0b11100, rs2 field 0, rm field 0b001.
  // Requires UseZfh (UseZfhmin alone is not accepted here).
  void fclass_h(Register Rd, FloatRegister Rs1) {
    assert_cond(UseZfh);
    fp_base<H_16_hp, 0b11100>(Rd, Rs1, 0b00000, 0b001);
  }

// Zfh and Zfhmin Half-Precision Floating-Point
  // fcvt.s.h: funct5 0b01000 with single-precision format; rs2 field 0b00010
  // selects the half-precision source. Requires UseZfh or UseZfhmin.
  void fcvt_s_h(FloatRegister Rd, FloatRegister Rs1, RoundingMode rm = rne) {
    assert_cond(UseZfh || UseZfhmin);
    fp_base<S_32_sp, 0b01000>(Rd, Rs1, 0b00010, rm);
  }

  // fcvt.h.s: funct5 0b01000 with half-precision format; rs2 field 0b00000
  // selects the single-precision source. Requires UseZfh or UseZfhmin.
  void fcvt_h_s(FloatRegister Rd, FloatRegister Rs1, RoundingMode rm = rne) {
    assert_cond(UseZfh || UseZfhmin);
    fp_base<H_16_hp, 0b01000>(Rd, Rs1, 0b00000, rm);
  }

  // fmv.h.x: integer Rs1 to FloatRegister Rd; half-precision format,
  // funct5 0b11110, rs2 and rm fields 0. Requires UseZfh or UseZfhmin.
  void fmv_h_x(FloatRegister Rd, Register Rs1) {
    assert_cond(UseZfh || UseZfhmin);
    fp_base<H_16_hp, 0b11110>(Rd, Rs1, 0b00000, 0b000);
  }

  // fmv.x.h: FloatRegister Rs1 to integer Rd; half-precision format,
  // funct5 0b11100, rs2 and rm fields 0. Requires UseZfh or UseZfhmin.
  void fmv_x_h(Register Rd, FloatRegister Rs1) {
    assert_cond(UseZfh || UseZfhmin);
    fp_base<H_16_hp, 0b11100>(Rd, Rs1, 0b00000, 0b000);
  }

  // fadd.h: half-precision, funct5 0b00000; rm defaults to rne. Requires UseZfh.
  void fadd_h(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    assert_cond(UseZfh);
    fp_base<H_16_hp, 0b00000>(Rd, Rs1, Rs2, rm);
  }

  // fsub.h: half-precision, funct5 0b00001; rm defaults to rne. Requires UseZfh.
  void fsub_h(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    assert_cond(UseZfh);
    fp_base<H_16_hp, 0b00001>(Rd, Rs1, Rs2, rm);
  }

  // fmul.h: half-precision, funct5 0b00010; rm defaults to rne. Requires UseZfh.
  void fmul_h(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    assert_cond(UseZfh);
    fp_base<H_16_hp, 0b00010>(Rd, Rs1, Rs2, rm);
  }

  // fdiv.h: half-precision, funct5 0b00011; rm defaults to rne. Requires UseZfh.
  void fdiv_h(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, RoundingMode rm = rne) {
    assert_cond(UseZfh);
    fp_base<H_16_hp, 0b00011>(Rd, Rs1, Rs2, rm);
  }

  // fsqrt.h: half-precision, funct5 0b01011, rs2 field 0; rm defaults to rne.
  // Requires UseZfh.
  void fsqrt_h(FloatRegister Rd, FloatRegister Rs1, RoundingMode rm = rne) {
    assert_cond(UseZfh);
    fp_base<H_16_hp, 0b01011>(Rd, Rs1, 0b00000, rm);
  }

  // fmin.h: half-precision, funct5 0b00101; the rm field carries selector 0b000.
  // Requires UseZfh.
  void fmin_h(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    assert_cond(UseZfh);
    fp_base<H_16_hp, 0b00101>(Rd, Rs1, Rs2, 0b000);
  }

  // fmax.h: half-precision, funct5 0b00101; the rm field carries selector 0b001.
  // Requires UseZfh.
  void fmax_h(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    assert_cond(UseZfh);
    fp_base<H_16_hp, 0b00101>(Rd, Rs1, Rs2, 0b001);
  }

  // fmadd.h: half-precision, major opcode 0b1000011; rm defaults to rne.
  // Requires UseZfh (checked here because fp_fm has no half-precision assert).
  void fmadd_h(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2, FloatRegister Rs3, RoundingMode rm = rne)  {
    assert_cond(UseZfh);
    fp_fm<H_16_hp, 0b1000011>(Rd, Rs1, Rs2, Rs3, rm);
  }

// --------------  ZFA Instruction Definitions  --------------
// Zfa Extension for Additional Floating-Point Instructions
  // fli.h: half-precision, funct5 0b11110, rs2 field 0b00001, rm field 0.
  // Rs1 is a raw 5-bit value placed in the rs1 field (presumably an index
  // selecting the constant to load - confirm). Requires UseZfa and UseZfh.
  void _fli_h(FloatRegister Rd, uint8_t Rs1) {
    assert_cond(UseZfa && UseZfh);
    fp_base<H_16_hp, 0b11110>(Rd, Rs1, 0b00001, 0b000);
  }

  // fli.s: single-precision, funct5 0b11110, rs2 field 0b00001, rm field 0.
  // Rs1 is a raw 5-bit value placed in the rs1 field (presumably an index
  // selecting the constant to load - confirm). Requires UseZfa.
  void _fli_s(FloatRegister Rd, uint8_t Rs1) {
    assert_cond(UseZfa);
    fp_base<S_32_sp, 0b11110>(Rd, Rs1, 0b00001, 0b000);
  }

  // fli.d: double-precision, funct5 0b11110, rs2 field 0b00001, rm field 0.
  // Rs1 is a raw 5-bit value placed in the rs1 field (presumably an index
  // selecting the constant to load - confirm). Requires UseZfa.
  void _fli_d(FloatRegister Rd, uint8_t Rs1) {
    assert_cond(UseZfa);
    fp_base<D_64_dp, 0b11110>(Rd, Rs1, 0b00001, 0b000);
  }

  // fminm.h: half-precision, funct5 0b00101; the rm field carries selector 0b010.
  // Requires UseZfa and UseZfh.
  void fminm_h(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    assert_cond(UseZfa && UseZfh);
    fp_base<H_16_hp, 0b00101>(Rd, Rs1, Rs2, 0b010);
  }

  // fmaxm.h: half-precision, funct5 0b00101; the rm field carries selector 0b011.
  // Requires UseZfa and UseZfh.
  void fmaxm_h(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    assert_cond(UseZfa && UseZfh);
    fp_base<H_16_hp, 0b00101>(Rd, Rs1, Rs2, 0b011);
  }

  // fminm.s: single-precision, funct5 0b00101; the rm field carries selector 0b010.
  // Requires UseZfa.
  void fminm_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    assert_cond(UseZfa);
    fp_base<S_32_sp, 0b00101>(Rd, Rs1, Rs2, 0b010);
  }

  // fmaxm.s: single-precision, funct5 0b00101; the rm field carries selector 0b011.
  // Requires UseZfa.
  void fmaxm_s(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    assert_cond(UseZfa);
    fp_base<S_32_sp, 0b00101>(Rd, Rs1, Rs2, 0b011);
  }

  // fminm.d: double-precision, funct5 0b00101; the rm field carries selector 0b010.
  // Requires UseZfa.
  void fminm_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    assert_cond(UseZfa);
    fp_base<D_64_dp, 0b00101>(Rd, Rs1, Rs2, 0b010);
  }

  // fmaxm.d: double-precision, funct5 0b00101; the rm field carries selector 0b011.
  // Requires UseZfa.
  void fmaxm_d(FloatRegister Rd, FloatRegister Rs1, FloatRegister Rs2) {
    assert_cond(UseZfa);
    fp_base<D_64_dp, 0b00101>(Rd, Rs1, Rs2, 0b011);
  }

// ==========================
// RISC-V Vector Extension
// ==========================
// Element-width selector patched into the vsew field by patch_vtype.
// Enumerators take their default values: e8 = 0, e16 = 1, e32 = 2, e64 = 3,
// RESERVED = 4. elembytes_to_sew relies on e8..e64 equalling log2 of the
// element size in bytes.
enum SEW {
  e8,
  e16,
  e32,
  e64,
  RESERVED,
};

// Three-bit encodings patched into the vlmul field by patch_vtype.
// The mfN names presumably denote fractional multipliers (1/N) and mN whole
// multipliers - confirm against the vector specification.
enum LMUL {
  mf8 = 0b101,
  mf4 = 0b110,
  mf2 = 0b111,
  m1  = 0b000,
  m2  = 0b001,
  m4  = 0b010,
  m8  = 0b011,
};

// Single-bit mask policy patched into the vma bit by patch_vtype
// (mu = 0, ma = 1).
enum VMA {
  mu, // undisturbed
  ma, // agnostic
};

// Single-bit tail policy patched into the vta bit by patch_vtype
// (tu = 0, ta = 1).
enum VTA {
  tu, // undisturbed
  ta, // agnostic
};

// Maps an element size in bytes to the matching SEW enumerator by taking
// log2 of the size (the enumerators e8..e64 are numbered 0..3). The size
// must be in 1..8; exact_log2 presumably also requires a power of two.
static Assembler::SEW elembytes_to_sew(int ebytes) {
  assert(ebytes > 0 && ebytes <= 8, "unsupported element size");
  const int log2_bytes = exact_log2(ebytes);
  return (Assembler::SEW) log2_bytes;
}

// Maps a BasicType to a SEW by converting it to an element size with
// type2aelembytes and delegating to elembytes_to_sew.
static Assembler::SEW elemtype_to_sew(BasicType etype) {
  return Assembler::elembytes_to_sew(type2aelembytes(etype));
}

// Writes a vtype immediate into the local variable `insn` of the enclosing
// function. The field occupies bits hsb..lsb of the instruction word:
//   lsb+2 .. lsb    vlmul
//   lsb+5 .. lsb+3  vsew
//   lsb+6           vta
//   lsb+7           vma
//   hsb-1 .. lsb+8  zero
//   hsb             vill (must be false, enforced by the guarantee)
// The body is wrapped in do { } while (0) so that the macro behaves as one
// statement (e.g. under an unbraced `if`), and hsb/lsb are parenthesised so
// that expression arguments keep their meaning. `!vill` is left untouched so
// that the text of the guarantee message stays the same.
#define patch_vtype(hsb, lsb, vlmul, vsew, vta, vma, vill)     \
  do {                                                         \
    /* If vill then other bits of vtype must be zero. */       \
    guarantee(!vill, "vill not supported");                    \
    patch((address)&insn, (lsb) + 2, (lsb), vlmul);            \
    patch((address)&insn, (lsb) + 5, (lsb) + 3, vsew);         \
    patch((address)&insn, (lsb) + 6, vta);                     \
    patch((address)&insn, (lsb) + 7, vma);                     \
    patch((address)&insn, (hsb) - 1, (lsb) + 8, 0);            \
    patch((address)&insn, (hsb), vill);                        \
  } while (0)

// vsetvli rd, rs1, vtypei
  // The vtype immediate is assembled from sew/lmul/vta/vma into bits 30..20
  // by patch_vtype; bit 31 is cleared. vill must be false (patch_vtype
  // guarantees it).
  void vsetvli(Register Rd, Register Rs1, SEW sew, LMUL lmul = m1,
               VMA vma = mu, VTA vta = tu, bool vill = false) {
    unsigned insn = 0;
    patch((address)&insn, 6, 0, 0b1010111);   // op
    patch((address)&insn, 14, 12, 0b111);     // funct3
    patch_vtype(30, 20, lmul, sew, vta, vma, vill);
    patch((address)&insn, 31, 0);
    patch_reg((address)&insn, 7, Rd);
    patch_reg((address)&insn, 15, Rs1);
    emit(insn);
  }

// vsetivli rd, uimm, vtypei
  // imm must fit in 5 unsigned bits and is placed in bits 19..15. The vtype
  // immediate occupies bits 29..20 (via patch_vtype) and bits 31..30 are set
  // to 0b11. vill must be false (patch_vtype guarantees it).
  void vsetivli(Register Rd, uint32_t imm, SEW sew, LMUL lmul = m1,
                VMA vma = mu, VTA vta = tu, bool vill = false) {
    unsigned insn = 0;
    guarantee(is_uimm5(imm), "uimm is invalid");
    patch((address)&insn, 6, 0, 0b1010111);   // op
    patch((address)&insn, 14, 12, 0b111);     // funct3
    patch((address)&insn, 19, 15, imm);
    patch_vtype(29, 20, lmul, sew, vta, vma, vill);
    patch((address)&insn, 31, 30, 0b11);
    patch_reg((address)&insn, 7, Rd);
    emit(insn);
  }

#undef patch_vtype

// Vector Configuration Instruction
  // vsetvl rd, rs1, rs2: three integer registers, with funct7 in bits 31..25.
  void vsetvl(Register Rd, Register Rs1, Register Rs2) {
    unsigned insn = 0;
    patch((address)&insn, 6,  0, 0b1010111);    // op
    patch((address)&insn, 14, 12, 0b111);       // funct3
    patch((address)&insn, 31, 25, 0b1000000);   // funct7
    patch_reg((address)&insn, 7, Rd);
    patch_reg((address)&insn, 15, Rs1);
    patch_reg((address)&insn, 20, Rs2);
    emit(insn);
  }

// Value of the vm bit (bit 25) of vector instructions.
enum VectorMask {
  v0_t     = 0,  // written as vm = 0
  unmasked = 1   // written as vm = 1; the default of most encoders
};

// Builds and emits one vector arithmetic instruction. The expansion declares
// a local `insn`, so it is meant to be the whole body of an encoder method.
// Fields written:
//   bits  6..0   op
//   bit   7 up   Reg          (destination, via patch_reg)
//   bits 14..12  funct3
//   bits 19..15  Reg_or_Imm5  (raw 5-bit value: register encoding, immediate
//                              or fixed selector)
//   bit  20 up   Vs2          (via patch_reg)
//   bit  25      vm
//   bits 31..26  funct6
#define patch_VArith(op, Reg, funct3, Reg_or_Imm5, Vs2, vm, funct6)            \
    unsigned insn = 0;                                                         \
    patch((address)&insn, 6, 0, op);                                           \
    patch((address)&insn, 14, 12, funct3);                                     \
    patch((address)&insn, 19, 15, Reg_or_Imm5);                                \
    patch((address)&insn, 25, vm);                                             \
    patch((address)&insn, 31, 26, funct6);                                     \
    patch_reg((address)&insn, 7, Reg);                                         \
    patch_reg((address)&insn, 20, Vs2);                                        \
    emit(insn)

// r2_vm
// Encoders with an integer destination Rd, one vector source Vs2 and an
// optional mask (default unmasked). The macro argument Vs1 is not a register:
// it is a constant written into bits 19..15 that tells apart the operations
// sharing one funct6.
#define INSN(NAME, op, funct3, Vs1, funct6)                                    \
  void NAME(Register Rd, VectorRegister Vs2, VectorMask vm = unmasked) {       \
    patch_VArith(op, Rd, funct3, Vs1, Vs2, vm, funct6);                        \
  }

  // Vector Mask
  INSN(vcpop_m,  0b1010111, 0b010, 0b10000, 0b010000);
  INSN(vfirst_m, 0b1010111, 0b010, 0b10001, 0b010000);
#undef INSN

// Unary vector encoders: vector destination Vd, one vector source Vs2 and an
// optional mask (default unmasked). The macro argument Vs1 is not a register:
// it is a constant written into bits 19..15 that selects the operation within
// a group sharing the same funct6.
#define INSN(NAME, op, funct3, Vs1, funct6)                                    \
  void NAME(VectorRegister Vd, VectorRegister Vs2, VectorMask vm = unmasked) { \
    patch_VArith(op, Vd, funct3, Vs1, Vs2, vm, funct6);                        \
  }

  // Vector Integer Extension
  INSN(vzext_vf2, 0b1010111, 0b010, 0b00110, 0b010010);
  INSN(vzext_vf4, 0b1010111, 0b010, 0b00100, 0b010010);
  INSN(vzext_vf8, 0b1010111, 0b010, 0b00010, 0b010010);
  INSN(vsext_vf2, 0b1010111, 0b010, 0b00111, 0b010010);
  INSN(vsext_vf4, 0b1010111, 0b010, 0b00101, 0b010010);
  INSN(vsext_vf8, 0b1010111, 0b010, 0b00011, 0b010010);

  // Vector Mask
  INSN(vmsbf_m,   0b1010111, 0b010, 0b00001, 0b010100);
  INSN(vmsif_m,   0b1010111, 0b010, 0b00011, 0b010100);
  INSN(vmsof_m,   0b1010111, 0b010, 0b00010, 0b010100);
  INSN(viota_m,   0b1010111, 0b010, 0b10000, 0b010100);

  // Vector Single-Width Floating-Point/Integer Type-Convert Instructions
  INSN(vfcvt_x_f_v,      0b1010111, 0b001, 0b00001, 0b010010);
  INSN(vfcvt_f_x_v,      0b1010111, 0b001, 0b00011, 0b010010);
  INSN(vfcvt_rtz_x_f_v,  0b1010111, 0b001, 0b00111, 0b010010);

  // Vector Widening Floating-Point/Integer Type-Convert Instructions
  INSN(vfwcvt_f_x_v,      0b1010111, 0b001, 0b01011, 0b010010);
  INSN(vfwcvt_f_f_v,      0b1010111, 0b001, 0b01100, 0b010010);
  INSN(vfwcvt_rtz_x_f_v,  0b1010111, 0b001, 0b01111, 0b010010);

  // Vector Narrowing Floating-Point/Integer Type-Convert Instructions
  INSN(vfncvt_f_x_w,      0b1010111, 0b001, 0b10011, 0b010010);
  INSN(vfncvt_f_f_w,      0b1010111, 0b001, 0b10100, 0b010010);
  INSN(vfncvt_rtz_x_f_w,  0b1010111, 0b001, 0b10111, 0b010010);

  // Vector Floating-Point Instruction
  INSN(vfsqrt_v,  0b1010111, 0b001, 0b00000, 0b010011);
  INSN(vfclass_v, 0b1010111, 0b001, 0b10000, 0b010011);

#undef INSN

// r2rd
// Two-register encoders without a mask parameter: vm is supplied by the INSN
// line (0b1 for every entry below) and the simm5 argument is a fixed constant
// written into bits 19..15. For the whole-register moves that constant is the
// register count minus one (0, 1, 3, 7 for vmv1r/2r/4r/8r).
#define INSN(NAME, op, funct3, simm5, vm, funct6)         \
  void NAME(VectorRegister Vd, VectorRegister Vs2) {      \
    patch_VArith(op, Vd, funct3, simm5, Vs2, vm, funct6); \
  }

  // Vector Whole Vector Register Move
  INSN(vmv1r_v, 0b1010111, 0b011, 0b00000, 0b1, 0b100111);
  INSN(vmv2r_v, 0b1010111, 0b011, 0b00001, 0b1, 0b100111);
  INSN(vmv4r_v, 0b1010111, 0b011, 0b00011, 0b1, 0b100111);
  INSN(vmv8r_v, 0b1010111, 0b011, 0b00111, 0b1, 0b100111);

#undef INSN

// Vector Floating-Point Move Instruction
  // vfmv.f.s: floating-point destination Rd, vector source Vs2. Bits 19..15
  // are fixed to 0b00000 and vm to 0b1; there is no mask parameter.
  void vfmv_f_s(FloatRegister Rd, VectorRegister Vs2) {
    patch_VArith(0b1010111, Rd, 0b001, 0b00000, Vs2, 0b1, 0b010000);
  }

// Vector Integer Scalar Move Instructions
  // vmv.x.s: integer destination Rd, vector source Vs2. Bits 19..15 are fixed
  // to 0b00000 and vm to 0b1; there is no mask parameter.
  void vmv_x_s(Register Rd, VectorRegister Vs2) {
    patch_VArith(0b1010111, Rd, 0b010, 0b00000, Vs2, 0b1, 0b010000);
  }

// r_vm
// Vector-immediate encoders taking an unsigned 5-bit immediate: imm is checked
// with is_uimm5 (guarantee) and written into bits 19..15. The mask defaults to
// unmasked.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, VectorRegister Vs2, uint32_t imm, VectorMask vm = unmasked) {       \
    guarantee(is_uimm5(imm), "uimm is invalid");                                                    \
    patch_VArith(op, Vd, funct3, (uint32_t)(imm & 0x1f), Vs2, vm, funct6);                         \
  }

  // Vector Single-Width Bit Shift Instructions
  INSN(vsra_vi,    0b1010111, 0b011, 0b101001);
  INSN(vsrl_vi,    0b1010111, 0b011, 0b101000);
  INSN(vsll_vi,    0b1010111, 0b011, 0b100101);

  // Vector Slide Instructions
  INSN(vslideup_vi,   0b1010111, 0b011, 0b001110);
  INSN(vslidedown_vi, 0b1010111, 0b011, 0b001111);

  // Vector Narrowing Integer Right Shift Instructions
  INSN(vnsra_wi, 0b1010111, 0b011, 0b101101);

#undef INSN

// Vector-vector multiply-add encoders. Note the parameter order: Vs1 comes
// BEFORE Vs2 in these signatures. Vs1's raw encoding goes into bits 19..15,
// Vs2 is patched in from bit 20. The mask defaults to unmasked.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, VectorRegister Vs1, VectorRegister Vs2, VectorMask vm = unmasked) { \
    patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6);                            \
  }

  // Vector Single-Width Floating-Point Fused Multiply-Add Instructions
  INSN(vfnmsub_vv, 0b1010111, 0b001, 0b101011);
  INSN(vfmsub_vv,  0b1010111, 0b001, 0b101010);
  INSN(vfnmadd_vv, 0b1010111, 0b001, 0b101001);
  INSN(vfmadd_vv,  0b1010111, 0b001, 0b101000);
  INSN(vfnmsac_vv, 0b1010111, 0b001, 0b101111);
  INSN(vfmsac_vv,  0b1010111, 0b001, 0b101110);
  INSN(vfmacc_vv,  0b1010111, 0b001, 0b101100);
  INSN(vfnmacc_vv, 0b1010111, 0b001, 0b101101);

  // Vector Single-Width Integer Multiply-Add Instructions
  INSN(vnmsub_vv, 0b1010111, 0b010, 0b101011);
  INSN(vmadd_vv,  0b1010111, 0b010, 0b101001);
  INSN(vnmsac_vv, 0b1010111, 0b010, 0b101111);
  INSN(vmacc_vv,  0b1010111, 0b010, 0b101101);

#undef INSN

// Vector-scalar integer multiply-add encoders. Note the parameter order: the
// integer register Rs1 comes BEFORE Vs2. Rs1's raw encoding goes into bits
// 19..15. The mask defaults to unmasked.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, Register Rs1, VectorRegister Vs2, VectorMask vm = unmasked) {       \
    patch_VArith(op, Vd, funct3, Rs1->raw_encoding(), Vs2, vm, funct6);                            \
  }

  // Vector Single-Width Integer Multiply-Add Instructions
  INSN(vnmsub_vx, 0b1010111, 0b110, 0b101011);
  INSN(vmadd_vx,  0b1010111, 0b110, 0b101001);
  INSN(vnmsac_vx, 0b1010111, 0b110, 0b101111);
  INSN(vmacc_vx,  0b1010111, 0b110, 0b101101);

#undef INSN

// Vector-scalar floating-point multiply-add encoders. Note the parameter
// order: the floating-point register Rs1 comes BEFORE Vs2. Rs1's raw encoding
// goes into bits 19..15. The mask defaults to unmasked.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, FloatRegister Rs1, VectorRegister Vs2, VectorMask vm = unmasked) {  \
    patch_VArith(op, Vd, funct3, Rs1->raw_encoding(), Vs2, vm, funct6);                            \
  }

  // Vector Single-Width Floating-Point Fused Multiply-Add Instructions
  INSN(vfnmsub_vf, 0b1010111, 0b101, 0b101011);
  INSN(vfmsub_vf,  0b1010111, 0b101, 0b101010);
  INSN(vfnmadd_vf, 0b1010111, 0b101, 0b101001);
  INSN(vfmadd_vf,  0b1010111, 0b101, 0b101000);
  INSN(vfnmsac_vf, 0b1010111, 0b101, 0b101111);
  INSN(vfmsac_vf,  0b1010111, 0b101, 0b101110);
  INSN(vfmacc_vf,  0b1010111, 0b101, 0b101100);
  INSN(vfnmacc_vf, 0b1010111, 0b101, 0b101101);

#undef INSN

// Vector-vector encoders with the operand order (Vd, Vs2, Vs1). Vs1's raw
// encoding goes into bits 19..15, Vs2 is patched in from bit 20, and the mask
// defaults to unmasked. Each INSN line supplies only op, funct3 and funct6.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1, VectorMask vm = unmasked) { \
    patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6);                            \
  }

  // Vector Single-Width Floating-Point Reduction Instructions
  INSN(vfredusum_vs,  0b1010111, 0b001, 0b000001);
  INSN(vfredosum_vs,  0b1010111, 0b001, 0b000011);
  INSN(vfredmin_vs,   0b1010111, 0b001, 0b000101);
  INSN(vfredmax_vs,   0b1010111, 0b001, 0b000111);

  // Vector Single-Width Integer Reduction Instructions
  INSN(vredsum_vs,    0b1010111, 0b010, 0b000000);
  INSN(vredand_vs,    0b1010111, 0b010, 0b000001);
  INSN(vredor_vs,     0b1010111, 0b010, 0b000010);
  INSN(vredxor_vs,    0b1010111, 0b010, 0b000011);
  INSN(vredminu_vs,   0b1010111, 0b010, 0b000100);
  INSN(vredmin_vs,    0b1010111, 0b010, 0b000101);
  INSN(vredmaxu_vs,   0b1010111, 0b010, 0b000110);
  INSN(vredmax_vs,    0b1010111, 0b010, 0b000111);

  // Vector Widening Integer Reduction Instructions
  INSN(vwredsum_vs,    0b1010111, 0b000, 0b110001);
  INSN(vwredsumu_vs,   0b1010111, 0b000, 0b110000);

  // Vector Floating-Point Compare Instructions
  INSN(vmfle_vv, 0b1010111, 0b001, 0b011001);
  INSN(vmflt_vv, 0b1010111, 0b001, 0b011011);
  INSN(vmfne_vv, 0b1010111, 0b001, 0b011100);
  INSN(vmfeq_vv, 0b1010111, 0b001, 0b011000);

  // Vector Floating-Point Sign-Injection Instructions
  INSN(vfsgnj_vv, 0b1010111, 0b001, 0b001000);
  INSN(vfsgnjx_vv, 0b1010111, 0b001, 0b001010);
  INSN(vfsgnjn_vv, 0b1010111, 0b001, 0b001001);

  // Vector Floating-Point MIN/MAX Instructions
  INSN(vfmax_vv,   0b1010111, 0b001, 0b000110);
  INSN(vfmin_vv,   0b1010111, 0b001, 0b000100);

  // Vector Single-Width Floating-Point Multiply/Divide Instructions
  INSN(vfdiv_vv,   0b1010111, 0b001, 0b100000);
  INSN(vfmul_vv,   0b1010111, 0b001, 0b100100);

  // Vector Single-Width Floating-Point Add/Subtract Instructions
  INSN(vfsub_vv, 0b1010111, 0b001, 0b000010);
  INSN(vfadd_vv, 0b1010111, 0b001, 0b000000);

  // Vector Single-Width Fractional Multiply with Rounding and Saturation
  INSN(vsmul_vv, 0b1010111, 0b000, 0b100111);

  // Vector Integer Divide Instructions
  INSN(vrem_vv,  0b1010111, 0b010, 0b100011);
  INSN(vremu_vv, 0b1010111, 0b010, 0b100010);
  INSN(vdiv_vv,  0b1010111, 0b010, 0b100001);
  INSN(vdivu_vv, 0b1010111, 0b010, 0b100000);

  // Vector Single-Width Integer Multiply Instructions
  INSN(vmulhsu_vv, 0b1010111, 0b010, 0b100110);
  INSN(vmulhu_vv,  0b1010111, 0b010, 0b100100);
  INSN(vmulh_vv,   0b1010111, 0b010, 0b100111);
  INSN(vmul_vv,    0b1010111, 0b010, 0b100101);

  // Vector Widening Integer Multiply Instructions
  INSN(vwmul_vv,    0b1010111, 0b010, 0b111011);
  INSN(vwmulu_vv,   0b1010111, 0b010, 0b111000);

  // Vector Integer Min/Max Instructions
  INSN(vmax_vv,  0b1010111, 0b000, 0b000111);
  INSN(vmaxu_vv, 0b1010111, 0b000, 0b000110);
  INSN(vmin_vv,  0b1010111, 0b000, 0b000101);
  INSN(vminu_vv, 0b1010111, 0b000, 0b000100);

  // Vector Integer Comparison Instructions
  INSN(vmsle_vv,  0b1010111, 0b000, 0b011101);
  INSN(vmsleu_vv, 0b1010111, 0b000, 0b011100);
  INSN(vmslt_vv,  0b1010111, 0b000, 0b011011);
  INSN(vmsltu_vv, 0b1010111, 0b000, 0b011010);
  INSN(vmsne_vv,  0b1010111, 0b000, 0b011001);
  INSN(vmseq_vv,  0b1010111, 0b000, 0b011000);

  // Vector Single-Width Bit Shift Instructions
  INSN(vsra_vv, 0b1010111, 0b000, 0b101001);
  INSN(vsrl_vv, 0b1010111, 0b000, 0b101000);
  INSN(vsll_vv, 0b1010111, 0b000, 0b100101);

  // Vector Bitwise Logical Instructions
  INSN(vxor_vv, 0b1010111, 0b000, 0b001011);
  INSN(vor_vv,  0b1010111, 0b000, 0b001010);
  INSN(vand_vv, 0b1010111, 0b000, 0b001001);

  // Vector Single-Width Integer Add and Subtract
  INSN(vadd_vv, 0b1010111, 0b000, 0b000000);
  INSN(vsub_vv, 0b1010111, 0b000, 0b000010);

  // Vector Saturating Integer Add and Subtract
  INSN(vsadd_vv,  0b1010111, 0b000, 0b100001);
  INSN(vsaddu_vv, 0b1010111, 0b000, 0b100000);
  INSN(vssub_vv,  0b1010111, 0b000, 0b100011);
  INSN(vssubu_vv, 0b1010111, 0b000, 0b100010);

  // Vector Register Gather Instructions
  INSN(vrgather_vv,     0b1010111, 0b000, 0b001100);

#undef INSN


// Vector-scalar encoders with the operand order (Vd, Vs2, Rs1). The raw
// encoding of the integer register Rs1 goes into bits 19..15, Vs2 is patched
// in from bit 20, and the mask defaults to unmasked.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, VectorRegister Vs2, Register Rs1, VectorMask vm = unmasked) {       \
    patch_VArith(op, Vd, funct3, Rs1->raw_encoding(), Vs2, vm, funct6);                            \
  }

  // Vector Integer Divide Instructions
  INSN(vrem_vx,  0b1010111, 0b110, 0b100011);
  INSN(vremu_vx, 0b1010111, 0b110, 0b100010);
  INSN(vdiv_vx,  0b1010111, 0b110, 0b100001);
  INSN(vdivu_vx, 0b1010111, 0b110, 0b100000);

  // Vector Single-Width Integer Multiply Instructions
  INSN(vmulhsu_vx, 0b1010111, 0b110, 0b100110);
  INSN(vmulhu_vx,  0b1010111, 0b110, 0b100100);
  INSN(vmulh_vx,   0b1010111, 0b110, 0b100111);
  INSN(vmul_vx,    0b1010111, 0b110, 0b100101);

  // Vector Widening Integer Add/Subtract
  INSN(vwadd_vx, 0b1010111, 0b110, 0b110001);

  // Vector Integer Min/Max Instructions
  INSN(vmax_vx,  0b1010111, 0b100, 0b000111);
  INSN(vmaxu_vx, 0b1010111, 0b100, 0b000110);
  INSN(vmin_vx,  0b1010111, 0b100, 0b000101);
  INSN(vminu_vx, 0b1010111, 0b100, 0b000100);

  // Vector Integer Comparison Instructions
  INSN(vmsgt_vx,  0b1010111, 0b100, 0b011111);
  INSN(vmsgtu_vx, 0b1010111, 0b100, 0b011110);
  INSN(vmsle_vx,  0b1010111, 0b100, 0b011101);
  INSN(vmsleu_vx, 0b1010111, 0b100, 0b011100);
  INSN(vmslt_vx,  0b1010111, 0b100, 0b011011);
  INSN(vmsltu_vx, 0b1010111, 0b100, 0b011010);
  INSN(vmsne_vx,  0b1010111, 0b100, 0b011001);
  INSN(vmseq_vx,  0b1010111, 0b100, 0b011000);

  // Vector Narrowing Integer Right Shift Instructions
  INSN(vnsra_wx, 0b1010111, 0b100, 0b101101);
  INSN(vnsrl_wx, 0b1010111, 0b100, 0b101100);

  // Vector Single-Width Bit Shift Instructions
  INSN(vsra_vx, 0b1010111, 0b100, 0b101001);
  INSN(vsrl_vx, 0b1010111, 0b100, 0b101000);
  INSN(vsll_vx, 0b1010111, 0b100, 0b100101);

  // Vector Bitwise Logical Instructions
  INSN(vxor_vx, 0b1010111, 0b100, 0b001011);
  INSN(vor_vx,  0b1010111, 0b100, 0b001010);
  INSN(vand_vx, 0b1010111, 0b100, 0b001001);

  // Vector Single-Width Integer Add and Subtract
  INSN(vsub_vx,  0b1010111, 0b100, 0b000010);
  INSN(vadd_vx,  0b1010111, 0b100, 0b000000);

  // Vector Integer reverse subtract
  INSN(vrsub_vx, 0b1010111, 0b100, 0b000011);

  // Vector Slide Instructions
  INSN(vslidedown_vx, 0b1010111, 0b100, 0b001111);

#undef INSN

// Vector Integer Merge Instructions
  // vmerge.vxm: the vm bit is fixed to 0b0 (the v0_t value of VectorMask), so
  // there is no mask parameter. Rs1's raw encoding goes into bits 19..15.
  void vmerge_vxm(VectorRegister Vd, VectorRegister Vs2, Register Rs1) {
    patch_VArith(0b1010111, Vd, 0b100, Rs1->raw_encoding(), Vs2, 0b0, 0b010111);
  }

// Vector Floating-Point Merge Instruction
  // vfmerge.vfm: the vm bit is fixed to 0b0 (the v0_t value of VectorMask), so
  // there is no mask parameter. Rs1's raw encoding goes into bits 19..15.
  void vfmerge_vfm(VectorRegister Vd, VectorRegister Vs2, FloatRegister Rs1) {
    patch_VArith(0b1010111, Vd, 0b101, Rs1->raw_encoding(), Vs2, 0b0, 0b010111);
  }

// Vector-scalar floating-point encoders with the operand order (Vd, Vs2, Rs1).
// The raw encoding of the floating-point register Rs1 goes into bits 19..15,
// Vs2 is patched in from bit 20, and the mask defaults to unmasked.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, VectorRegister Vs2, FloatRegister Rs1, VectorMask vm = unmasked) {  \
    patch_VArith(op, Vd, funct3, Rs1->raw_encoding(), Vs2, vm, funct6);                            \
  }

  // Vector Floating-Point Compare Instructions
  INSN(vmfge_vf, 0b1010111, 0b101, 0b011111);
  INSN(vmfgt_vf, 0b1010111, 0b101, 0b011101);
  INSN(vmfle_vf, 0b1010111, 0b101, 0b011001);
  INSN(vmflt_vf, 0b1010111, 0b101, 0b011011);
  INSN(vmfne_vf, 0b1010111, 0b101, 0b011100);
  INSN(vmfeq_vf, 0b1010111, 0b101, 0b011000);

  // Vector Floating-Point MIN/MAX Instructions
  INSN(vfmax_vf, 0b1010111, 0b101, 0b000110);
  INSN(vfmin_vf, 0b1010111, 0b101, 0b000100);

  // Vector Single-Width Floating-Point Multiply/Divide Instructions
  INSN(vfdiv_vf,  0b1010111, 0b101, 0b100000);
  INSN(vfmul_vf,  0b1010111, 0b101, 0b100100);
  INSN(vfrdiv_vf, 0b1010111, 0b101, 0b100001);

  // Vector Single-Width Floating-Point Add/Subtract Instructions
  INSN(vfsub_vf,  0b1010111, 0b101, 0b000010);
  INSN(vfadd_vf,  0b1010111, 0b101, 0b000000);
  INSN(vfrsub_vf, 0b1010111, 0b101, 0b100111);

#undef INSN

// Vector-immediate encoders taking a signed 5-bit immediate: imm is checked
// with is_simm5 (guarantee) and its low five bits (imm & 0x1f) are written
// into bits 19..15. The mask defaults to unmasked.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, VectorRegister Vs2, int32_t imm, VectorMask vm = unmasked) {        \
    guarantee(is_simm5(imm), "imm is invalid");                                                    \
    patch_VArith(op, Vd, funct3, (uint32_t)(imm & 0x1f), Vs2, vm, funct6);                         \
  }

  // Vector Integer Comparison Instructions
  INSN(vmsgt_vi,  0b1010111, 0b011, 0b011111);
  INSN(vmsgtu_vi, 0b1010111, 0b011, 0b011110);
  INSN(vmsle_vi,  0b1010111, 0b011, 0b011101);
  INSN(vmsleu_vi, 0b1010111, 0b011, 0b011100);
  INSN(vmsne_vi,  0b1010111, 0b011, 0b011001);
  INSN(vmseq_vi,  0b1010111, 0b011, 0b011000);

  // Vector Bitwise Logical Instructions
  INSN(vxor_vi,   0b1010111, 0b011, 0b001011);
  INSN(vor_vi,    0b1010111, 0b011, 0b001010);
  INSN(vand_vi,   0b1010111, 0b011, 0b001001);

  // Vector Single-Width Integer Add and Subtract
  INSN(vadd_vi,   0b1010111, 0b011, 0b000000);

  // Vector Integer reverse subtract
  INSN(vrsub_vi,  0b1010111, 0b011, 0b000011);

#undef INSN

// Vector Integer Merge Instructions
  // vmerge.vim: imm must be a signed 5-bit value; its low five bits go into
  // bits 19..15. The vm bit is fixed to 0b0 (the v0_t value of VectorMask), so
  // there is no mask parameter.
  void vmerge_vim(VectorRegister Vd, VectorRegister Vs2, int32_t imm) {
    guarantee(is_simm5(imm), "imm is invalid");
    patch_VArith(0b1010111, Vd, 0b011, (uint32_t)(imm & 0x1f), Vs2, 0b0, 0b010111);
  }

// Vector-vector encoders (Vd, Vs2, Vs1) without a mask parameter: the vm bit
// is supplied by each INSN line (0b1 for vcompress and the mask-register
// logicals, 0b0 for vmerge_vvm). Vs1's raw encoding goes into bits 19..15.
#define INSN(NAME, op, funct3, vm, funct6)                                   \
  void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1) {     \
    patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6);      \
  }

  // Vector Compress Instruction
  INSN(vcompress_vm, 0b1010111, 0b010, 0b1, 0b010111);

  // Vector Mask-Register Logical Instructions
  INSN(vmxnor_mm,   0b1010111, 0b010, 0b1, 0b011111);
  INSN(vmorn_mm,    0b1010111, 0b010, 0b1, 0b011100);
  INSN(vmnor_mm,    0b1010111, 0b010, 0b1, 0b011110);
  INSN(vmor_mm,     0b1010111, 0b010, 0b1, 0b011010);
  INSN(vmxor_mm,    0b1010111, 0b010, 0b1, 0b011011);
  INSN(vmandn_mm,   0b1010111, 0b010, 0b1, 0b011000);
  INSN(vmnand_mm,   0b1010111, 0b010, 0b1, 0b011101);
  INSN(vmand_mm,    0b1010111, 0b010, 0b1, 0b011001);

  // Vector Integer Merge Instructions
  INSN(vmerge_vvm,  0b1010111, 0b000, 0b0, 0b010111);

#undef INSN

// Vector Integer Move Instructions
  // vmv.v.i: imm must be a signed 5-bit value; its low five bits go into bits
  // 19..15. The Vs2 field is fixed to v0 and the vm bit to 0b1.
  void vmv_v_i(VectorRegister Vd, int32_t imm) {
    guarantee(is_simm5(imm), "imm is invalid");
    patch_VArith(0b1010111, Vd, 0b011, (uint32_t)(imm & 0x1f), v0, 0b1, 0b010111);
  }

// Moves from a floating-point register into a vector register. The macro
// arguments Vs2 and vm are fixed per instruction (v0 and 0b1 for both entries
// below); Rs1's raw encoding goes into bits 19..15.
#define INSN(NAME, op, funct3, Vs2, vm, funct6)                             \
  void NAME(VectorRegister Vd, FloatRegister Rs1) {                         \
    patch_VArith(op, Vd, funct3, Rs1->raw_encoding(), Vs2, vm, funct6);     \
  }

  // Floating-Point Scalar Move Instructions
  INSN(vfmv_s_f, 0b1010111, 0b101, v0, 0b1, 0b010000);
  // Vector Floating-Point Move Instruction
  INSN(vfmv_v_f, 0b1010111, 0b101, v0, 0b1, 0b010111);

#undef INSN

// Vector Integer Move Instructions
  // vmv.v.v: Vs1's raw encoding goes into bits 19..15; the Vs2 field is fixed
  // to v0 and the vm bit to 0b1.
  void vmv_v_v(VectorRegister Vd, VectorRegister Vs1) {
    patch_VArith(0b1010111, Vd, 0b000, Vs1->raw_encoding(), v0, 0b1, 0b010111);
  }

// Moves from an integer register into a vector register. The macro arguments
// Vs2 and vm are fixed per instruction (v0 and 0b1 for both entries below);
// Rs1's raw encoding goes into bits 19..15.
#define INSN(NAME, op, funct3, Vs2, vm, funct6)                             \
  void NAME(VectorRegister Vd, Register Rs1) {                              \
    patch_VArith(op, Vd, funct3, Rs1->raw_encoding(), Vs2, vm, funct6);     \
  }

  // Integer Scalar Move Instructions
  INSN(vmv_s_x, 0b1010111, 0b110, v0, 0b1, 0b010000);

  // Vector Integer Move Instructions
  INSN(vmv_v_x, 0b1010111, 0b100, v0, 0b1, 0b010111);

#undef INSN

// Vector Element Index Instruction
  // vid.v has no source operands: bits 24..12 are one fixed 13-bit constant
  // covering what other encoders use for Vs2 (24..20), the 19..15 field and
  // funct3 (14..12). The mask defaults to unmasked.
  void vid_v(VectorRegister Vd, VectorMask vm = unmasked) {
    unsigned insn = 0;
    patch((address)&insn, 6, 0, 0b1010111);          // op
    patch((address)&insn, 24, 12, 0b0000010001010);  // funct13
    patch((address)&insn, 25, vm);
    patch((address)&insn, 31, 26, 0b010100);         // funct6
    patch_reg((address)&insn, 7, Vd);
    emit(insn);
  }

// Value of the 3-bit nf field (bits 31..29) of vector load/store
// instructions. gN is encoded as N - 1.
enum Nf {
  g1 = 0,
  g2 = 1,
  g3 = 2,
  g4 = 3,
  g5 = 4,
  g6 = 5,
  g7 = 6,
  g8 = 7
};

// Builds and emits one vector load/store instruction. The expansion declares
// a local `insn`, so it is meant to be the whole body of an encoder method.
// Fields written:
//   bits  6..0   op
//   bit   7 up   VReg         (data register, via patch_reg)
//   bits 14..12  width
//   bit  15 up   Rs1          (base address register, via patch_reg)
//   bits 24..20  Reg_or_umop  (raw 5-bit value: register encoding or fixed
//                              lumop/sumop constant)
//   bit  25      vm
//   bits 27..26  mop
//   bit  28      mew
//   bits 31..29  nf
#define patch_VLdSt(op, VReg, width, Rs1, Reg_or_umop, vm, mop, mew, nf) \
    unsigned insn = 0;                                                   \
    patch((address)&insn, 6, 0, op);                                     \
    patch((address)&insn, 14, 12, width);                                \
    patch((address)&insn, 24, 20, Reg_or_umop);                          \
    patch((address)&insn, 25, vm);                                       \
    patch((address)&insn, 27, 26, mop);                                  \
    patch((address)&insn, 28, mew);                                      \
    patch((address)&insn, 31, 29, nf);                                   \
    patch_reg((address)&insn, 7, VReg);                                  \
    patch_reg((address)&insn, 15, Rs1);                                  \
    emit(insn)

// Whole-register loads vl<N>re<W>.v: every field except Vd and Rs1 is fixed
// by the INSN line. In the table below the register count N is carried by nf
// (g1/g2/g4/g8) and W by the width field; lumop is 0b01000 and vm is 0b1 for
// all entries. The width constant is checked with is_uimm3.
#define INSN(NAME, op, width, lumop, vm, mop, mew, nf)                               \
  void NAME(VectorRegister Vd, Register Rs1) {                                       \
    guarantee(is_uimm3(width), "width is invalid");                                  \
    patch_VLdSt(op, Vd, width, Rs1, lumop, vm, mop, mew, nf);                        \
  }

  // Vector Load/Store Instructions
  INSN(vl1re8_v,  0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g1);
  INSN(vl1re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g1);
  INSN(vl1re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g1);
  INSN(vl1re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g1);
  INSN(vl2re8_v,  0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g2);
  INSN(vl2re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g2);
  INSN(vl2re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g2);
  INSN(vl2re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g2);
  INSN(vl4re8_v,  0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g4);
  INSN(vl4re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g4);
  INSN(vl4re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g4);
  INSN(vl4re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g4);
  INSN(vl8re8_v,  0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g8);
  INSN(vl8re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g8);
  INSN(vl8re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g8);
  INSN(vl8re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g8);

#undef INSN

// Whole-register stores vs<N>r.v: every field except Vs3 (the register to
// store) and Rs1 is fixed by the INSN line. The register count N is carried
// by nf (g1/g2/g4/g8); width is 0b000, sumop 0b01000 and vm 0b1 throughout.
#define INSN(NAME, op, width, sumop, vm, mop, mew, nf)           \
  void NAME(VectorRegister Vs3, Register Rs1) {                  \
    patch_VLdSt(op, Vs3, width, Rs1, sumop, vm, mop, mew, nf);   \
  }

  // Vector Load/Store Instructions
  INSN(vs1r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g1);
  INSN(vs2r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g2);
  INSN(vs4r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g4);
  INSN(vs8r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g8);

#undef INSN

// r2_nfvm
// Mask load/store (vlm.v / vsm.v). The vm bit is hard-wired to 1 in the macro
// body, so there is no mask parameter; nf defaults to g1. umop is 0b01011 for
// both entries.
#define INSN(NAME, op, width, umop, mop, mew)                         \
  void NAME(VectorRegister Vd_or_Vs3, Register Rs1, Nf nf = g1) {     \
    patch_VLdSt(op, Vd_or_Vs3, width, Rs1, umop, 1, mop, mew, nf);    \
  }

  // Vector Unit-Stride Instructions
  INSN(vlm_v, 0b0000111, 0b000, 0b01011, 0b00, 0b0);
  INSN(vsm_v, 0b0100111, 0b000, 0b01011, 0b00, 0b0);

#undef INSN

// Unit-stride loads and stores. Vd_or_Vs3 is the destination for loads
// (op 0b0000111) and the data source for stores (op 0b0100111); Rs1 is the
// base address register. The mask defaults to unmasked and nf to g1. umop is
// 0b00000 for the plain forms and 0b10000 for the fault-only-first loads.
#define INSN(NAME, op, width, umop, mop, mew)                                               \
  void NAME(VectorRegister Vd_or_Vs3, Register Rs1, VectorMask vm = unmasked, Nf nf = g1) { \
    patch_VLdSt(op, Vd_or_Vs3, width, Rs1, umop, vm, mop, mew, nf);                         \
  }

  // Vector Unit-Stride Instructions
  INSN(vle8_v,    0b0000111, 0b000, 0b00000, 0b00, 0b0);
  INSN(vle16_v,   0b0000111, 0b101, 0b00000, 0b00, 0b0);
  INSN(vle32_v,   0b0000111, 0b110, 0b00000, 0b00, 0b0);
  INSN(vle64_v,   0b0000111, 0b111, 0b00000, 0b00, 0b0);

  // Vector unit-stride fault-only-first Instructions
  INSN(vle8ff_v,  0b0000111, 0b000, 0b10000, 0b00, 0b0);
  INSN(vle16ff_v, 0b0000111, 0b101, 0b10000, 0b00, 0b0);
  INSN(vle32ff_v, 0b0000111, 0b110, 0b10000, 0b00, 0b0);
  INSN(vle64ff_v, 0b0000111, 0b111, 0b10000, 0b00, 0b0);

  INSN(vse8_v,  0b0100111, 0b000, 0b00000, 0b00, 0b0);
  INSN(vse16_v, 0b0100111, 0b101, 0b00000, 0b00, 0b0);
  INSN(vse32_v, 0b0100111, 0b110, 0b00000, 0b00, 0b0);
  INSN(vse64_v, 0b0100111, 0b111, 0b00000, 0b00, 0b0);

#undef INSN

// Unit-stride segment loads and stores with a fixed segment count: nf comes
// from the INSN line (g3 or g4) instead of being a method parameter. The mask
// defaults to unmasked.
#define INSN(NAME, op, width, umop, mop, mew, nf)                               \
  void NAME(VectorRegister Vd_or_Vs3, Register Rs1, VectorMask vm = unmasked) { \
    patch_VLdSt(op, Vd_or_Vs3, width, Rs1, umop, vm, mop, mew, nf);             \
  }

  // Vector Unit-Stride Segment Load Instructions
  INSN(vlseg3e8_v, 0b0000111, 0b000, 0b00000, 0b00, 0b0, g3);
  INSN(vlseg4e8_v, 0b0000111, 0b000, 0b00000, 0b00, 0b0, g4);

  // Vector Unit-Stride Segment Store Instructions
  INSN(vsseg3e8_v, 0b0100111, 0b000, 0b00000, 0b00, 0b0, g3);
  INSN(vsseg4e8_v, 0b0100111, 0b000, 0b00000, 0b00, 0b0, g4);

#undef INSN

// Indexed (mop = 0b01) loads and stores. Vd is the destination for loads
// (op 0b0000111) and the data source for stores (op 0b0100111); Rs1 is the
// base address register and the raw encoding of the index vector Vs2 goes
// into bits 24..20. The mask defaults to unmasked and nf to g1.
#define INSN(NAME, op, width, mop, mew)                                                                  \
  void NAME(VectorRegister Vd, Register Rs1, VectorRegister Vs2, VectorMask vm = unmasked, Nf nf = g1) { \
    patch_VLdSt(op, Vd, width, Rs1, Vs2->raw_encoding(), vm, mop, mew, nf);                              \
  }

  // Vector unordered indexed load instructions
  INSN( vluxei8_v, 0b0000111, 0b000, 0b01, 0b0);
  INSN(vluxei32_v, 0b0000111, 0b110, 0b01, 0b0);
  INSN(vluxei64_v, 0b0000111, 0b111, 0b01, 0b0);

  // Vector unordered indexed store instructions
  INSN( vsuxei8_v, 0b0100111, 0b000, 0b01, 0b0);
  INSN(vsuxei32_v, 0b0100111, 0b110, 0b01, 0b0);
  INSN(vsuxei64_v, 0b0100111, 0b111, 0b01, 0b0);

#undef INSN

// Strided (mop = 0b10) loads and stores. Vd is the destination for loads
// (op 0b0000111) and the data source for stores (op 0b0100111); Rs1 is the
// base address register and the raw encoding of the integer register Rs2
// goes into bits 24..20. The mask defaults to unmasked and nf to g1.
// This is the last user of patch_VLdSt, which is undefined afterwards.
#define INSN(NAME, op, width, mop, mew)                                                                  \
  void NAME(VectorRegister Vd, Register Rs1, Register Rs2, VectorMask vm = unmasked, Nf nf = g1) {       \
    patch_VLdSt(op, Vd, width, Rs1, Rs2->raw_encoding(), vm, mop, mew, nf);                              \
  }

  // Vector Strided Instructions
  INSN(vlse8_v,  0b0000111, 0b000, 0b10, 0b0);
  INSN(vlse16_v, 0b0000111, 0b101, 0b10, 0b0);
  INSN(vlse32_v, 0b0000111, 0b110, 0b10, 0b0);
  INSN(vlse64_v, 0b0000111, 0b111, 0b10, 0b0);

  INSN(vsse8_v,  0b0100111, 0b000, 0b10, 0b0);
  INSN(vsse16_v, 0b0100111, 0b101, 0b10, 0b0);
  INSN(vsse32_v, 0b0100111, 0b110, 0b10, 0b0);
  INSN(vsse64_v, 0b0100111, 0b111, 0b10, 0b0);

#undef INSN
#undef patch_VLdSt

// ====================================
// RISC-V Vector Crypto Extension
// ====================================

// Vector-vector encoders for the vector crypto extensions, operand order
// (Vd, Vs2, Vs1). Vs1's raw encoding goes into bits 19..15 and the mask
// defaults to unmasked. No extension flag is checked here.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1, VectorMask vm = unmasked) { \
    patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6);                            \
  }

  // Vector Bit-manipulation used in Cryptography (Zvbb) Extension
  INSN(vandn_vv,   0b1010111, 0b000, 0b000001);
  INSN(vror_vv,    0b1010111, 0b000, 0b010100);
  INSN(vrol_vv,    0b1010111, 0b000, 0b010101);

  // Vector Bit-manipulation used in Cryptography (Zvbc) Extension
  INSN(vclmul_vv,  0b1010111, 0b010, 0b001100);
  INSN(vclmulh_vv, 0b1010111, 0b010, 0b001101);

#undef INSN

// Vector-scalar encoders for the vector crypto extensions, operand order
// (Vd, Vs2, Rs1). The raw encoding of the integer register Rs1 goes into bits
// 19..15 and the mask defaults to unmasked. No extension flag is checked here.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, VectorRegister Vs2, Register Rs1, VectorMask vm = unmasked) {       \
    patch_VArith(op, Vd, funct3, Rs1->raw_encoding(), Vs2, vm, funct6);                            \
  }

  // Vector Bit-manipulation used in Cryptography (Zvbb) Extension
  INSN(vandn_vx,   0b1010111, 0b100, 0b000001);
  INSN(vrol_vx,    0b1010111, 0b100, 0b010101);
  INSN(vror_vx,    0b1010111, 0b100, 0b010100);

#undef INSN

// Encoder body for vector-immediate forms that carry a 6-bit unsigned immediate.
// The immediate is split: its low five bits go to [19:15] and its top bit to bit 26,
// which leaves only five bits ([31:27]) for the value passed as 'funct6'.
// Fields are written from the most significant end down; each patch touches a
// distinct bit range, so the order does not affect the result.
#define patch_VArith_imm6(op, Reg, funct3, Reg_or_Imm5, I5, Vs2, vm, funct6)   \
    unsigned insn = 0;                                                         \
    patch((address)&insn, 31, 27, funct6);                                     \
    patch((address)&insn, 26, I5);                                             \
    patch((address)&insn, 25, vm);                                             \
    patch_reg((address)&insn, 20, Vs2);                                        \
    patch((address)&insn, 19, 15, Reg_or_Imm5);                                \
    patch((address)&insn, 14, 12, funct3);                                     \
    patch_reg((address)&insn, 7, Reg);                                         \
    patch((address)&insn, 6, 0, op);                                           \
    emit(insn)

// Generates NAME(Vd, Vs2, imm, vm); imm must pass is_uimm6.
#define INSN(NAME, op, funct3, funct6)                                                             \
  void NAME(VectorRegister Vd, VectorRegister Vs2, uint32_t imm, VectorMask vm = unmasked) {       \
    guarantee(is_uimm6(imm), "uimm is invalid");                                                   \
    const uint32_t imm_low5 = (uint32_t)(imm & 0x1f);                                              \
    const uint32_t imm_bit5 = (uint32_t)((imm >> 5) & 0x1);                                        \
    patch_VArith_imm6(op, Vd, funct3, imm_low5, imm_bit5, Vs2, vm, funct6);                        \
  }

  // Vector Bit-manipulation used in Cryptography (Zvbb) Extension
  // NOTE: there is no corresponding vrol.vi supplied by the extension, but it can be emulated with vror.vi easily.
  INSN(vror_vi,    0b1010111, 0b011, 0b01010);

#undef INSN
#undef patch_VArith_imm6

// Single-source vector form: generates NAME(Vd, Vs2, vm).
// The Vs1 macro argument is not a register here; it is a per-instruction constant
// placed in the slot patch_VArith uses for its register-or-immediate operand, and
// it is what tells apart instructions that share the same op/funct3/funct6.
#define INSN(NAME, op, funct3, Vs1, funct6)                                    \
  void NAME(VectorRegister Vd, VectorRegister Vs2, VectorMask vm = unmasked) { \
    patch_VArith(op, Vd, funct3, Vs1, Vs2, vm, funct6);                        \
  }

  // Vector Bit-manipulation used in Cryptography (Zvkb) Extension
  INSN(vbrev_v,  0b1010111, 0b010, 0b01010, 0b010010); // reverse bits in every element
  INSN(vbrev8_v, 0b1010111, 0b010, 0b01000, 0b010010); // reverse bits in every byte of element
  INSN(vrev8_v,  0b1010111, 0b010, 0b01001, 0b010010); // reverse bytes in every element

  // Vector AES instructions (Zvkned extension)
  INSN(vaesem_vv,   0b1110111, 0b010, 0b00010, 0b101000);
  INSN(vaesef_vv,   0b1110111, 0b010, 0b00011, 0b101000);

  INSN(vaesdm_vv,   0b1110111, 0b010, 0b00000, 0b101000);
  INSN(vaesdf_vv,   0b1110111, 0b010, 0b00001, 0b101000);

  // Not part of the AES group: these two share op and funct6 with the
  // vbrev/vrev8 bit-manipulation entries above.
  INSN(vclz_v,  0b1010111, 0b010, 0b01100, 0b010010); // count leading zeros
  INSN(vctz_v,  0b1010111, 0b010, 0b01101, 0b010010); // count trailing zeros

#undef INSN

// Three-register vector form without a mask parameter: generates NAME(Vd, Vs2, Vs1).
// The vm field is supplied by each INSN line as a constant (0b1 for every entry
// below) instead of being chosen by the caller.
#define INSN(NAME, op, funct3, vm, funct6)                                   \
  void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1) {     \
    patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6);      \
  }

  // Vector SHA-2 Secure Hash (Zvknh[ab]) Extension
  INSN(vsha2ms_vv,  0b1110111, 0b010, 0b1, 0b101101);
  INSN(vsha2ch_vv,  0b1110111, 0b010, 0b1, 0b101110);
  INSN(vsha2cl_vv,  0b1110111, 0b010, 0b1, 0b101111);

#undef INSN

// Single-source vector form: generates NAME(Vd, Vs2, vm). The Vs1 macro argument
// is a constant selector passed in the register-or-immediate slot of patch_VArith.
#define INSN(NAME, op, funct3, Vs1, funct6)                                    \
  void NAME(VectorRegister Vd, VectorRegister Vs2, VectorMask vm = unmasked) { \
    patch_VArith(op, Vd, funct3, Vs1, Vs2, vm, funct6);                        \
  }

  // Vector Basic Bit-manipulation (Zvbb) Extension
  INSN(vcpop_v,  0b1010111, 0b010, 0b01110, 0b010010);

#undef INSN

#undef patch_VArith

// ====================================
// RISC-V Bit-Manipulation Extension
// Currently only the Zba, Zbb and Zbs bitmanip extensions are supported.
// ====================================
// Three-register scalar encoders: generates NAME(Rd, Rs1, Rs2).
// Fields are written from the top of the word down: funct7 [31:25], Rs2 from
// bit 20, Rs1 from bit 15, funct3 [14:12], Rd from bit 7, opcode [6:0].
// The patches target disjoint bit ranges, so their order is irrelevant.
#define INSN(NAME, op, funct3, funct7)                  \
  void NAME(Register Rd, Register Rs1, Register Rs2) {  \
    unsigned inst32 = 0;                                \
    patch((address)&inst32, 31, 25, funct7);            \
    patch_reg((address)&inst32, 20, Rs2);               \
    patch_reg((address)&inst32, 15, Rs1);               \
    patch((address)&inst32, 14, 12, funct3);            \
    patch_reg((address)&inst32, 7, Rd);                 \
    patch((address)&inst32, 6, 0, op);                  \
    emit(inst32);                                       \
  }

  INSN(add_uw,    0b0111011, 0b000, 0b0000100);
  INSN(rolr,      0b0110011, 0b001, 0b0110000);
  INSN(rolrw,     0b0111011, 0b001, 0b0110000);
  INSN(rorr,      0b0110011, 0b101, 0b0110000);
  INSN(rorrw,     0b0111011, 0b101, 0b0110000);
  INSN(sh1add,    0b0110011, 0b010, 0b0010000);
  INSN(sh2add,    0b0110011, 0b100, 0b0010000);
  INSN(sh3add,    0b0110011, 0b110, 0b0010000);
  INSN(sh1add_uw, 0b0111011, 0b010, 0b0010000);
  INSN(sh2add_uw, 0b0111011, 0b100, 0b0010000);
  INSN(sh3add_uw, 0b0111011, 0b110, 0b0010000);
  INSN(andn,      0b0110011, 0b111, 0b0100000);
  INSN(orn,       0b0110011, 0b110, 0b0100000);
  INSN(xnor,      0b0110011, 0b100, 0b0100000);
  INSN(max,       0b0110011, 0b110, 0b0000101);
  INSN(maxu,      0b0110011, 0b111, 0b0000101);
  INSN(min,       0b0110011, 0b100, 0b0000101);
  INSN(minu,      0b0110011, 0b101, 0b0000101);

#undef INSN

// Two-register scalar encoders: generates NAME(Rd, Rs1).
// The whole of [31:20] is a fixed 12-bit function code; the remaining fields are
// Rs1 from bit 15, funct3 [14:12], Rd from bit 7 and the opcode in [6:0].
#define INSN(NAME, op, funct3, funct12)                 \
  void NAME(Register Rd, Register Rs1) {                \
    unsigned inst32 = 0;                                \
    patch((address)&inst32, 31, 20, funct12);           \
    patch_reg((address)&inst32, 15, Rs1);               \
    patch((address)&inst32, 14, 12, funct3);            \
    patch_reg((address)&inst32, 7, Rd);                 \
    patch((address)&inst32, 6, 0, op);                  \
    emit(inst32);                                       \
  }

  INSN(brev8,  0b0010011, 0b101, 0b011010000111);
  INSN(rev8,   0b0010011, 0b101, 0b011010111000);
  INSN(_sext_b, 0b0010011, 0b001, 0b011000000100);
  INSN(_sext_h, 0b0010011, 0b001, 0b011000000101);
  INSN(_zext_h, 0b0111011, 0b100, 0b000010000000);
  INSN(clz,    0b0010011, 0b001, 0b011000000000);
  INSN(clzw,   0b0011011, 0b001, 0b011000000000);
  INSN(ctz,    0b0010011, 0b001, 0b011000000001);
  INSN(ctzw,   0b0011011, 0b001, 0b011000000001);
  INSN(cpop,   0b0010011, 0b001, 0b011000000010);
  INSN(cpopw,  0b0011011, 0b001, 0b011000000010);
  INSN(orc_b,  0b0010011, 0b101, 0b001010000111);

#undef INSN

// Register/shift-amount encoders with a 6-bit shift amount: generates
// NAME(Rd, Rs1, shamt). shamt must not exceed 0x3f and lands in [25:20];
// funct6 takes the six bits above it.
#define INSN(NAME, op, funct3, funct6)                  \
  void NAME(Register Rd, Register Rs1, unsigned shamt) {\
    guarantee(shamt <= 0x3f, "Shamt is invalid");       \
    unsigned inst32 = 0;                                \
    patch((address)&inst32, 31, 26, funct6);            \
    patch((address)&inst32, 25, 20, shamt);             \
    patch_reg((address)&inst32, 15, Rs1);               \
    patch((address)&inst32, 14, 12, funct3);            \
    patch_reg((address)&inst32, 7, Rd);                 \
    patch((address)&inst32, 6, 0, op);                  \
    emit(inst32);                                       \
  }

  INSN(rori,    0b0010011, 0b101, 0b011000);
  INSN(slli_uw, 0b0011011, 0b001, 0b000010);
  INSN(bexti,   0b0010011, 0b101, 0b010010);

#undef INSN

// Register/shift-amount encoder with a 5-bit shift amount: generates
// NAME(Rd, Rs1, shamt). shamt must not exceed 0x1f and lands in [24:20];
// funct7 fills [31:25].
#define INSN(NAME, op, funct3, funct7)                  \
  void NAME(Register Rd, Register Rs1, unsigned shamt) {\
    guarantee(shamt <= 0x1f, "Shamt is invalid");       \
    unsigned inst32 = 0;                                \
    patch((address)&inst32, 31, 25, funct7);            \
    patch((address)&inst32, 24, 20, shamt);             \
    patch_reg((address)&inst32, 15, Rs1);               \
    patch((address)&inst32, 14, 12, funct3);            \
    patch_reg((address)&inst32, 7, Rd);                 \
    patch((address)&inst32, 6, 0, op);                  \
    emit(inst32);                                       \
  }

  INSN(roriw, 0b0011011, 0b101, 0b0110000);

#undef INSN

// ========================================
// RISC-V Compressed Instructions Extension
// ========================================
// Note:
// 1. Assembler functions encoding 16-bit compressed instructions always begin with a 'c_'
//    prefix, such as 'c_add'. Correspondingly, assembler functions encoding normal 32-bit
//    instructions always begin with a '_' prefix, such as '_add'. Most of the time users have
//    no need to explicitly emit these compressed instructions. Instead, they still use unified
//    wrappers such as 'add' which do the compressing work through 'c_add' depending on
//    the operands of the instruction and availability of the RVC hardware extension.
//
// 2. 'CompressibleScope' and 'IncompressibleScope' are introduced to mark assembler scopes
//     within which instructions are qualified or unqualified to be compressed into their 16-bit
//     versions. An example:
//
//      CompressibleScope scope(_masm);
//      __ add(...);       // this instruction will be compressed into 'c.add' when possible
//      {
//         IncompressibleScope scope(_masm);
//         __ add(...);    // this instruction will not be compressed
//         {
//            CompressibleScope scope(_masm);
//            __ add(...); // this instruction will be compressed into 'c.add' when possible
//         }
//      }
//
// 3. When printing JIT assembly code, using -XX:PrintAssemblyOptions=no-aliases could help
//    distinguish compressed 16-bit instructions from normal 32-bit ones.

private:
  // Current compressible-scope state; changed through set_in_compressible_scope().
  bool _in_compressible_scope;
public:
  // Returns the current compressible-scope flag.
  bool in_compressible_scope() const {
    return _in_compressible_scope;
  }
  // Overwrites the compressible-scope flag with b.
  void set_in_compressible_scope(bool b) {
    _in_compressible_scope = b;
  }
public:

  // An abstract compressible scope
  // Shared base of the scope guards: keeps the assembler pointer and a snapshot
  // of its compressible-scope flag taken at construction time.
  class AbstractCompressibleScope : public StackObj {
  protected:
    Assembler *_masm;                    // assembler whose flag is managed
    bool _saved_in_compressible_scope;   // flag value seen when the guard was built

    AbstractCompressibleScope(Assembler *masm)
    : _masm(masm)
    , _saved_in_compressible_scope(masm->in_compressible_scope()) {}
  };
  // A compressible scope
  // Sets the assembler's compressible-scope flag to true on construction and
  // puts the previously saved value back on destruction.
  class CompressibleScope : public AbstractCompressibleScope {
  public:
    CompressibleScope(Assembler *masm) : AbstractCompressibleScope(masm) {
      masm->set_in_compressible_scope(true);
    }
    ~CompressibleScope() {
      _masm->set_in_compressible_scope(_saved_in_compressible_scope);
    }
  };
  // An incompressible scope
  // Sets the assembler's compressible-scope flag to false on construction and
  // puts the previously saved value back on destruction.
  class IncompressibleScope : public AbstractCompressibleScope {
  public:
    IncompressibleScope(Assembler *masm) : AbstractCompressibleScope(masm) {
      masm->set_in_compressible_scope(false);
    }
    ~IncompressibleScope() {
      _masm->set_in_compressible_scope(_saved_in_compressible_scope);
    }
  };

public:
  // Emit a relocation.
  // Both overloads forward their arguments unchanged to AbstractAssembler::relocate;
  // 'format' defaults to 0. They differ only in how the relocation is described
  // (a RelocationHolder versus a relocInfo::relocType).
  void relocate(RelocationHolder const& rspec, int format = 0) {
    AbstractAssembler::relocate(rspec, format);
  }
  void relocate(relocInfo::relocType rtype, int format = 0) {
    AbstractAssembler::relocate(rtype, format);
  }
  // Relocation variants that also emit the instructions the relocation covers.
  // The relocation is recorded first; emit_insts() is then invoked while an
  // IncompressibleScope is alive, so the compressible-scope flag is false for the
  // duration of the callback and is restored when 'scope' is destroyed on return.
  template <typename Callback>
  void relocate(RelocationHolder const& rspec, Callback emit_insts, int format = 0) {
    AbstractAssembler::relocate(rspec, format);
    IncompressibleScope scope(this); // relocations
    emit_insts();
  }
  // Same as above, with the relocation given as a relocInfo::relocType.
  template <typename Callback>
  void relocate(relocInfo::relocType rtype, Callback emit_insts, int format = 0) {
    AbstractAssembler::relocate(rtype, format);
    IncompressibleScope scope(this); // relocations
    emit_insts();
  }

  // patch a 16-bit instruction.
  // Replaces bits [msb:lsb] of the 16-bit value read from 'a' with 'val' and
  // writes the result back; bits outside that range are preserved.
  // 'val' must fit in (msb - lsb + 1) bits, otherwise the guarantee fires.
  static void c_patch(address a, unsigned msb, unsigned lsb, uint16_t val) {
    assert_cond(a != nullptr);
    assert_cond(msb >= lsb && msb <= 15);
    unsigned nbits = msb - lsb + 1;
    guarantee(val < (1U << nbits), "Field too big for insn");
    const uint16_t field_mask = (uint16_t)(((1U << nbits) - 1) << lsb);
    const uint16_t field_bits = (uint16_t)(val << lsb);
    const uint16_t previous = ld_c_instr(a);
    sd_c_instr(a, (uint16_t)((previous & ~field_mask) | field_bits));
  }

  // Single-bit convenience overload: patches bit position 'bit' (msb == lsb == bit)
  // with 'val', which therefore has to be 0 or 1.
  static void c_patch(address a, unsigned bit, uint16_t val) {
    c_patch(a, bit, bit, val);
  }

  // patch a 16-bit instruction with a general purpose register ranging [0, 31] (5 bits)
  static void c_patch_reg(address a, unsigned lsb, Register reg) {
    const unsigned msb = lsb + 4;   // 5-bit field
    c_patch(a, msb, lsb, reg->raw_encoding());
  }

  // patch a 16-bit instruction with a general purpose register ranging [8, 15] (3 bits)
  static void c_patch_compressed_reg(address a, unsigned lsb, Register reg) {
    const unsigned msb = lsb + 2;   // 3-bit field
    c_patch(a, msb, lsb, reg->compressed_raw_encoding());
  }

  // patch a 16-bit instruction with a float register ranging [0, 31] (5 bits)
  static void c_patch_reg(address a, unsigned lsb, FloatRegister reg) {
    const unsigned msb = lsb + 4;   // 5-bit field
    c_patch(a, msb, lsb, reg->raw_encoding());
  }

  // patch a 16-bit instruction with a float register ranging [8, 15] (3 bits)
  static void c_patch_compressed_reg(address a, unsigned lsb, FloatRegister reg) {
    const unsigned msb = lsb + 2;   // 3-bit field
    c_patch(a, msb, lsb, reg->compressed_raw_encoding());
  }

// --------------  RVC Instruction Definitions  --------------

  // Compressed no-op, emitted as c_addi with x0 as the register and 0 as the immediate.
  void c_nop() {
    c_addi(x0, 0);
  }

// Register/signed-6-bit-immediate encoders: generates NAME(Rd_Rs1, imm).
// imm[5] goes to bit 12 and imm[4:0] to [6:2]; the full 5-bit register field
// starts at bit 7. Fields are written top-down; the ranges do not overlap.
#define INSN(NAME, funct3, op)                                                               \
  void NAME(Register Rd_Rs1, int64_t imm) {                                                  \
    assert_cond(is_simm6(imm));                                                              \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, (imm & nth_bit(5)) >> 5);                                  \
    c_patch_reg((address)&inst16, 7, Rd_Rs1);                                                \
    c_patch((address)&inst16, 6, 2, (imm & right_n_bits(5)));                                \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_addi,   0b000, 0b01);
  INSN(c_addiw,  0b001, 0b01);

#undef INSN

// Stack-pointer adjust encoder: generates NAME(imm) with sp as the fixed register.
// imm must be a non-zero signed 10-bit value that is a multiple of 16. Its bits
// are scattered as: imm[9] -> 12, imm[4] -> 6, imm[6] -> 5, imm[8:7] -> [4:3],
// imm[5] -> 2.
#define INSN(NAME, funct3, op)                                                               \
  void NAME(int64_t imm) {                                                                   \
    assert_cond(is_simm10(imm));                                                             \
    assert_cond((imm & 0b1111) == 0);                                                        \
    assert_cond(imm != 0);                                                                   \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, (imm & nth_bit(9)) >> 9);                                  \
    c_patch_reg((address)&inst16, 7, sp);                                                    \
    c_patch((address)&inst16, 6, (imm & nth_bit(4)) >> 4);                                   \
    c_patch((address)&inst16, 5, (imm & nth_bit(6)) >> 6);                                   \
    c_patch((address)&inst16, 4, 3, (imm & right_n_bits(9)) >> 7);                           \
    c_patch((address)&inst16, 2, (imm & nth_bit(5)) >> 5);                                   \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_addi16sp, 0b011, 0b01);

#undef INSN

// Encoder taking a compressed destination register and an unsigned immediate:
// generates NAME(Rd, uimm). uimm must be a non-zero unsigned 10-bit value that is
// a multiple of 4. Bit placement: uimm[5:4] -> [12:11], uimm[9:6] -> [10:7],
// uimm[2] -> 6, uimm[3] -> 5; the 3-bit register field starts at bit 2.
#define INSN(NAME, funct3, op)                                                               \
  void NAME(Register Rd, uint64_t uimm) {                                                    \
    assert_cond(is_uimm10(uimm));                                                            \
    assert_cond((uimm & 0b11) == 0);                                                         \
    assert_cond(uimm != 0);                                                                  \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, 11, (uimm & right_n_bits(6)) >> 4);                        \
    c_patch((address)&inst16, 10, 7, (uimm & right_n_bits(10)) >> 6);                        \
    c_patch((address)&inst16, 6, (uimm & nth_bit(2)) >> 2);                                  \
    c_patch((address)&inst16, 5, (uimm & nth_bit(3)) >> 3);                                  \
    c_patch_compressed_reg((address)&inst16, 2, Rd);                                         \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_addi4spn, 0b000, 0b00);

#undef INSN

// Shift-by-immediate encoder on a full 5-bit register field: generates
// NAME(Rd_Rs1, shamt). shamt must be a non-zero unsigned 6-bit value and the
// register must not be x0. shamt[5] -> bit 12, shamt[4:0] -> [6:2].
#define INSN(NAME, funct3, op)                                                               \
  void NAME(Register Rd_Rs1, uint32_t shamt) {                                               \
    assert_cond(is_uimm6(shamt));                                                            \
    assert_cond(shamt != 0);                                                                 \
    assert_cond(Rd_Rs1 != x0);                                                               \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, (shamt & nth_bit(5)) >> 5);                                \
    c_patch_reg((address)&inst16, 7, Rd_Rs1);                                                \
    c_patch((address)&inst16, 6, 2, (shamt & right_n_bits(5)));                              \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_slli, 0b000, 0b10);

#undef INSN

// Shift-by-immediate encoders on a compressed (3-bit) register field: generates
// NAME(Rd_Rs1, shamt). shamt must be a non-zero unsigned 6-bit value.
// shamt[5] -> bit 12, funct2 -> [11:10], register from bit 7, shamt[4:0] -> [6:2].
#define INSN(NAME, funct3, funct2, op)                                                       \
  void NAME(Register Rd_Rs1, uint32_t shamt) {                                               \
    assert_cond(is_uimm6(shamt));                                                            \
    assert_cond(shamt != 0);                                                                 \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, (shamt & nth_bit(5)) >> 5);                                \
    c_patch((address)&inst16, 11, 10, funct2);                                               \
    c_patch_compressed_reg((address)&inst16, 7, Rd_Rs1);                                     \
    c_patch((address)&inst16, 6, 2, (shamt & right_n_bits(5)));                              \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_srli, 0b100, 0b00, 0b01);
  INSN(c_srai, 0b100, 0b01, 0b01);

#undef INSN

// Compressed-register/signed-6-bit-immediate encoder: generates NAME(Rd_Rs1, imm).
// imm[5] -> bit 12, funct2 -> [11:10], 3-bit register field from bit 7,
// imm[4:0] -> [6:2].
#define INSN(NAME, funct3, funct2, op)                                                       \
  void NAME(Register Rd_Rs1, int64_t imm) {                                                  \
    assert_cond(is_simm6(imm));                                                              \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, (imm & nth_bit(5)) >> 5);                                  \
    c_patch((address)&inst16, 11, 10, funct2);                                               \
    c_patch_compressed_reg((address)&inst16, 7, Rd_Rs1);                                     \
    c_patch((address)&inst16, 6, 2, (imm & right_n_bits(5)));                                \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_andi, 0b100, 0b10, 0b01);

#undef INSN

// Two-operand arithmetic on compressed (3-bit) register fields: generates
// NAME(Rd_Rs1, Rs2). funct6 -> [15:10], Rd_Rs1 from bit 7, funct2 -> [6:5],
// Rs2 from bit 2, op -> [1:0].
#define INSN(NAME, funct6, funct2, op)                                                       \
  void NAME(Register Rd_Rs1, Register Rs2) {                                                 \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 10, funct6);                                               \
    c_patch_compressed_reg((address)&inst16, 7, Rd_Rs1);                                     \
    c_patch((address)&inst16, 6, 5, funct2);                                                 \
    c_patch_compressed_reg((address)&inst16, 2, Rs2);                                        \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_sub,  0b100011, 0b00, 0b01);
  INSN(c_xor,  0b100011, 0b01, 0b01);
  INSN(c_or,   0b100011, 0b10, 0b01);
  INSN(c_and,  0b100011, 0b11, 0b01);
  INSN(c_subw, 0b100111, 0b00, 0b01);
  INSN(c_addw, 0b100111, 0b01, 0b01);

#undef INSN

// Two-operand encoders on full 5-bit register fields: generates NAME(Rd_Rs1, Rs2).
// funct4 -> [15:12], Rd_Rs1 from bit 7, Rs2 from bit 2, op -> [1:0].
// Neither register may be x0. In particular, a zero Rs2 field combined with
// funct4 0b1000/0b1001 and op 0b10 is exactly the bit pattern c_cr_if builds for
// c_jr/c_jalr, so passing x0 as Rs2 would silently emit a jump instead of the
// requested move/add. Both preconditions are therefore asserted.
#define INSN(NAME, funct4, op)                                                               \
  void NAME(Register Rd_Rs1, Register Rs2) {                                                 \
    assert_cond(Rd_Rs1 != x0);                                                               \
    assert_cond(Rs2 != x0);                                                                  \
    uint16_t insn = 0;                                                                       \
    c_patch((address)&insn, 1, 0, op);                                                       \
    c_patch_reg((address)&insn, 2, Rs2);                                                     \
    c_patch_reg((address)&insn, 7, Rd_Rs1);                                                  \
    c_patch((address)&insn, 15, 12, funct4);                                                 \
    emit_int16(insn);                                                                        \
  }

  INSN(c_mv,  0b1000, 0b10);
  INSN(c_add, 0b1001, 0b10);

#undef INSN

 private:
  // All calls and jumps must go via MASM.
  // Format CR, c.jr/c.jalr
  // Note C instruction can't be changed, i.e. relocation patching.
  // The first template argument fills [15:12], the second fills [1:0]; the
  // register field starting at bit 2 is always x0 and Rs1 (which must not be x0)
  // occupies the field starting at bit 7.
  template <uint8_t Funct4, uint8_t Op>
  void c_cr_if(Register Rs1) {
    assert_cond(Rs1 != x0);
    uint16_t inst16 = 0;
    c_patch((address)&inst16, 15, 12, Funct4);
    c_patch_reg((address)&inst16, 7, Rs1);
    c_patch_reg((address)&inst16, 2, x0);
    c_patch((address)&inst16, 1, 0, Op);
    emit_int16(inst16);
  }

  // Both go through c_cr_if with 0b10 in [1:0]; they differ only in the value
  // placed in [15:12] (0b1000 for c_jr, 0b1001 for c_jalr).
  void c_jr(Register Rs1)   { c_cr_if<0b1000, 0b10>(Rs1); }
  void c_jalr(Register Rs1) { c_cr_if<0b1001, 0b10>(Rs1); }

  typedef void (Assembler::* j_c_insn)(address dest);
  typedef void (Assembler::* compare_and_branch_c_insn)(Register Rs1, address dest);

  // Emits a label-targeted jump through 'insn'. For an unbound label a patch
  // site is registered and the instruction is emitted against the current pc;
  // for a bound label the instruction is emitted against the label's target.
  void wrap_label(Label &L, j_c_insn insn) {
    if (!L.is_bound()) {
      L.add_patch_at(code(), locator());
      (this->*insn)(pc());
      return;
    }
    (this->*insn)(target(L));
  }

  // Compare-and-branch flavour of wrap_label: same bound/unbound handling, with
  // register 'r' forwarded as the first argument of 'insn'.
  void wrap_label(Label &L, Register r, compare_and_branch_c_insn insn) {
    if (!L.is_bound()) {
      L.add_patch_at(code(), locator());
      (this->*insn)(r, pc());
      return;
    }
    (this->*insn)(r, target(L));
  }

  // Format CJ, c.j (c.jal)
  // Note C instruction can't be changed, i.e. relocation patching.
  // 'offset' must be an even signed 12-bit value. Its bits are scattered as:
  // offset[11] -> 12, offset[4] -> 11, offset[9:8] -> [10:9], offset[10] -> 8,
  // offset[6] -> 7, offset[7] -> 6, offset[3:1] -> [5:3], offset[5] -> 2.
  void c_j(int32_t offset) {
    assert(is_simm12(offset) && ((offset % 2) == 0), "invalid encoding");
    uint16_t inst16 = 0;
    c_patch((address)&inst16, 15, 13, 0b101);
    c_patch((address)&inst16, 12, (offset & nth_bit(11)) >> 11);
    c_patch((address)&inst16, 11, (offset & nth_bit(4)) >> 4);
    c_patch((address)&inst16, 10, 9, (offset & right_n_bits(10)) >> 8);
    c_patch((address)&inst16, 8, (offset & nth_bit(10)) >> 10);
    c_patch((address)&inst16, 7, (offset & nth_bit(6)) >> 6);
    c_patch((address)&inst16, 6, (offset & nth_bit(7)) >> 7);
    c_patch((address)&inst16, 5, 3, (offset & right_n_bits(4)) >> 1);
    c_patch((address)&inst16, 2, (offset & nth_bit(5)) >> 5);
    c_patch((address)&inst16, 1, 0, 0b01);
    emit_int16(inst16);
  }

  // Address flavour: turns 'dest' into a displacement from the current pc and
  // defers to the offset overload. The displacement must be an even signed
  // 12-bit value.
  void c_j(address dest) {
    assert_cond(dest != nullptr);
    int64_t distance = dest - pc();
    assert(is_simm12(distance) && ((distance % 2) == 0), "invalid encoding");
    c_j(distance);
  }

  // Label flavour: wrap_label resolves the label (or registers a patch site)
  // and calls back into one of the c_j overloads.
  void c_j(Label &L) {
    wrap_label(L, &Assembler::c_j);
  }

  public:

// Compressed compare-with-zero branches; three overloads are generated per NAME:
//   NAME(Rs1, imm)   - imm is an even signed 9-bit displacement, scattered as
//                      imm[8] -> 12, imm[4:3] -> [11:10], imm[7:6] -> [6:5],
//                      imm[2:1] -> [4:3], imm[5] -> 2; Rs1 is a 3-bit field at bit 7.
//   NAME(Rs1, dest)  - computes the displacement from the current pc.
//   NAME(Rs1, Label) - resolves the label through wrap_label.
#define INSN(NAME, funct3, op)                                                               \
  void NAME(Register Rs1, int32_t imm) {                                                     \
    assert(is_simm9(imm) && ((imm % 2) == 0), "invalid encoding");                           \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, (imm & nth_bit(8)) >> 8);                                  \
    c_patch((address)&inst16, 11, 10, (imm & right_n_bits(5)) >> 3);                         \
    c_patch_compressed_reg((address)&inst16, 7, Rs1);                                        \
    c_patch((address)&inst16, 6, 5, (imm & right_n_bits(8)) >> 6);                           \
    c_patch((address)&inst16, 4, 3, (imm & right_n_bits(3)) >> 1);                           \
    c_patch((address)&inst16, 2, (imm & nth_bit(5)) >> 5);                                   \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }                                                                                          \
  void NAME(Register Rs1, address dest) {                                                    \
    assert_cond(dest != nullptr);                                                            \
    int64_t distance = dest - pc();                                                          \
    assert(is_simm9(distance) && ((distance % 2) == 0), "invalid encoding");                 \
    NAME(Rs1, distance);                                                                     \
  }                                                                                          \
  void NAME(Register Rs1, Label &L) {                                                        \
    wrap_label(L, Rs1, &Assembler::NAME);                                                    \
  }

  INSN(c_beqz, 0b110, 0b01);
  INSN(c_bnez, 0b111, 0b01);

#undef INSN

// Upper-immediate load encoder: generates NAME(Rd, imm).
// imm must be a non-zero signed 18-bit value with its low 12 bits clear, and Rd
// must be neither x0 nor x2. imm[17] -> bit 12, imm[16:12] -> [6:2].
#define INSN(NAME, funct3, op)                                                               \
  void NAME(Register Rd, int32_t imm) {                                                      \
    assert_cond(is_simm18(imm));                                                             \
    assert_cond((imm & 0xfff) == 0);                                                         \
    assert_cond(imm != 0);                                                                   \
    assert_cond(Rd != x0 && Rd != x2);                                                       \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, (imm & nth_bit(17)) >> 17);                                \
    c_patch_reg((address)&inst16, 7, Rd);                                                    \
    c_patch((address)&inst16, 6, 2, (imm & right_n_bits(17)) >> 12);                         \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_lui, 0b011, 0b01);

#undef INSN

// Immediate load encoder: generates NAME(Rd, imm).
// imm must be a signed 6-bit value and Rd must not be x0.
// imm[5] -> bit 12, imm[4:0] -> [6:2].
#define INSN(NAME, funct3, op)                                                               \
  void NAME(Register Rd, int32_t imm) {                                                      \
    assert_cond(is_simm6(imm));                                                              \
    assert_cond(Rd != x0);                                                                   \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, (imm & right_n_bits(6)) >> 5);                             \
    c_patch_reg((address)&inst16, 7, Rd);                                                    \
    c_patch((address)&inst16, 6, 2, (imm & right_n_bits(5)));                                \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_li, 0b010, 0b01);

#undef INSN

// Offset-only load encoder for a general purpose register: generates NAME(Rd, uimm).
// uimm must be an unsigned 9-bit multiple of 8 and Rd must not be x0.
// uimm[5] -> bit 12, uimm[4:3] -> [6:5], uimm[8:6] -> [4:2].
#define INSN(NAME, funct3, op)                                                               \
  void NAME(Register Rd, uint32_t uimm) {                                                    \
    assert_cond(is_uimm9(uimm));                                                             \
    assert_cond((uimm & 0b111) == 0);                                                        \
    assert_cond(Rd != x0);                                                                   \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, (uimm & nth_bit(5)) >> 5);                                 \
    c_patch_reg((address)&inst16, 7, Rd);                                                    \
    c_patch((address)&inst16, 6, 5, (uimm & right_n_bits(5)) >> 3);                          \
    c_patch((address)&inst16, 4, 2, (uimm & right_n_bits(9)) >> 6);                          \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_ldsp,  0b011, 0b10);

#undef INSN

// Offset-only load encoder for a float register: generates NAME(Rd, uimm).
// uimm must be an unsigned 9-bit multiple of 8; unlike the general purpose
// variant there is no restriction on the register number.
// uimm[5] -> bit 12, uimm[4:3] -> [6:5], uimm[8:6] -> [4:2].
#define INSN(NAME, funct3, op)                                                               \
  void NAME(FloatRegister Rd, uint32_t uimm) {                                               \
    assert_cond(is_uimm9(uimm));                                                             \
    assert_cond((uimm & 0b111) == 0);                                                        \
    uint16_t inst16 = 0;                                                                     \
    c_patch((address)&inst16, 15, 13, funct3);                                               \
    c_patch((address)&inst16, 12, (uimm & nth_bit(5)) >> 5);                                 \
    c_patch_reg((address)&inst16, 7, Rd);                                                    \
    c_patch((address)&inst16, 6, 5, (uimm & right_n_bits(5)) >> 3);                          \
    c_patch((address)&inst16, 4, 2, (uimm & right_n_bits(9)) >> 6);                          \
    c_patch((address)&inst16, 1, 0, op);                                                     \
    emit_int16(inst16);                                                                      \
  }

  INSN(c_fldsp, 0b001, 0b10);

#undef INSN

// c.ld / c.sd / c.fld / c.fsd: register-based compressed 64-bit loads and stores.
// uimm must be an unsigned 8-bit offset with its low three bits clear:
// uimm[7:6] -> insn[6:5], uimm[5:3] -> insn[12:10].
// Both registers go through c_patch_compressed_reg (Rs1 at bit 7, Rd_Rs2 at bit 2).
#define INSN(NAME, funct3, op, REGISTER_TYPE)                                                \
  void NAME(REGISTER_TYPE Rd_Rs2, Register Rs1, uint32_t uimm) {                             \
    assert_cond(is_uimm8(uimm));                                                             \
    assert_cond((uimm & 0b111) == 0);                                                        \
    const uint16_t imm_7_6 = (uimm & right_n_bits(8)) >> 6;                                  \
    const uint16_t imm_5_3 = (uimm & right_n_bits(6)) >> 3;                                  \
    uint16_t insn = 0;                                                                       \
    c_patch((address)&insn, 15, 13, funct3);                                                 \
    c_patch((address)&insn, 12, 10, imm_5_3);                                                \
    c_patch_compressed_reg((address)&insn, 7, Rs1);                                          \
    c_patch((address)&insn, 6, 5, imm_7_6);                                                  \
    c_patch_compressed_reg((address)&insn, 2, Rd_Rs2);                                       \
    c_patch((address)&insn, 1, 0, op);                                                       \
    emit_int16(insn);                                                                        \
  }

  INSN(c_ld,  0b011, 0b00, Register);
  INSN(c_sd,  0b111, 0b00, Register);
  INSN(c_fld, 0b001, 0b00, FloatRegister);
  INSN(c_fsd, 0b101, 0b00, FloatRegister);

#undef INSN

// c.sdsp / c.fsdsp: the 16-bit encodings that sd()/fsd() select for sp-relative stores.
// uimm must be an unsigned 9-bit offset with its low three bits clear:
// uimm[5:3] -> insn[12:10], uimm[8:6] -> insn[9:7]. Rs2 is encoded at bit 2 via c_patch_reg.
#define INSN(NAME, funct3, op, REGISTER_TYPE)                                                \
  void NAME(REGISTER_TYPE Rs2, uint32_t uimm) {                                              \
    assert_cond(is_uimm9(uimm));                                                             \
    assert_cond((uimm & 0b111) == 0);                                                        \
    const uint16_t imm_8_6 = (uimm & right_n_bits(9)) >> 6;                                  \
    const uint16_t imm_5_3 = (uimm & right_n_bits(6)) >> 3;                                  \
    uint16_t insn = 0;                                                                       \
    c_patch((address)&insn, 15, 13, funct3);                                                 \
    c_patch((address)&insn, 12, 10, imm_5_3);                                                \
    c_patch((address)&insn, 9, 7, imm_8_6);                                                  \
    c_patch_reg((address)&insn, 2, Rs2);                                                     \
    c_patch((address)&insn, 1, 0, op);                                                       \
    emit_int16(insn);                                                                        \
  }

  INSN(c_sdsp,  0b111, 0b10, Register);
  INSN(c_fsdsp, 0b101, 0b10, FloatRegister);

#undef INSN

// c.swsp: the 16-bit encoding that sw() selects for sp-relative word stores.
// uimm must be an unsigned 8-bit offset with its low two bits clear:
// uimm[5:2] -> insn[12:9], uimm[7:6] -> insn[8:7]. Rs2 is encoded at bit 2 via c_patch_reg.
#define INSN(NAME, funct3, op)                                                               \
  void NAME(Register Rs2, uint32_t uimm) {                                                   \
    assert_cond(is_uimm8(uimm));                                                             \
    assert_cond((uimm & 0b11) == 0);                                                         \
    const uint16_t imm_7_6 = (uimm & right_n_bits(8)) >> 6;                                  \
    const uint16_t imm_5_2 = (uimm & right_n_bits(6)) >> 2;                                  \
    uint16_t insn = 0;                                                                       \
    c_patch((address)&insn, 15, 13, funct3);                                                 \
    c_patch((address)&insn, 12, 9, imm_5_2);                                                 \
    c_patch((address)&insn, 8, 7, imm_7_6);                                                  \
    c_patch_reg((address)&insn, 2, Rs2);                                                     \
    c_patch((address)&insn, 1, 0, op);                                                       \
    emit_int16(insn);                                                                        \
  }

  INSN(c_swsp, 0b110, 0b10);

#undef INSN

// c.lwsp: the 16-bit encoding that lw() selects for sp-relative word loads.
// Rd must not be x0. uimm must be an unsigned 8-bit offset with its low two bits clear:
// uimm[5] -> insn[12], uimm[4:2] -> insn[6:4], uimm[7:6] -> insn[3:2].
#define INSN(NAME, funct3, op)                                                               \
  void NAME(Register Rd, uint32_t uimm) {                                                    \
    assert_cond(is_uimm8(uimm));                                                             \
    assert_cond((uimm & 0b11) == 0);                                                         \
    assert_cond(Rd != x0);                                                                   \
    const uint16_t imm_7_6 = (uimm & right_n_bits(8)) >> 6;                                  \
    const uint16_t imm_4_2 = (uimm & right_n_bits(5)) >> 2;                                  \
    const uint16_t imm_5   = (uimm & nth_bit(5)) >> 5;                                       \
    uint16_t insn = 0;                                                                       \
    c_patch((address)&insn, 15, 13, funct3);                                                 \
    c_patch((address)&insn, 12, 12, imm_5);                                                  \
    c_patch_reg((address)&insn, 7, Rd);                                                      \
    c_patch((address)&insn, 6, 4, imm_4_2);                                                  \
    c_patch((address)&insn, 3, 2, imm_7_6);                                                  \
    c_patch((address)&insn, 1, 0, op);                                                       \
    emit_int16(insn);                                                                        \
  }

  INSN(c_lwsp, 0b010, 0b10);

#undef INSN

// c.lw / c.sw: register-based compressed word load and store.
// uimm must be an unsigned 7-bit offset with its low two bits clear:
// uimm[5:3] -> insn[12:10], uimm[2] -> insn[6], uimm[6] -> insn[5].
// Both registers go through c_patch_compressed_reg (Rs1 at bit 7, Rd_Rs2 at bit 2).
#define INSN(NAME, funct3, op)                                                               \
  void NAME(Register Rd_Rs2, Register Rs1, uint32_t uimm) {                                  \
    assert_cond(is_uimm7(uimm));                                                             \
    assert_cond((uimm & 0b11) == 0);                                                         \
    const uint16_t imm_6   = (uimm & nth_bit(6)) >> 6;                                       \
    const uint16_t imm_2   = (uimm & nth_bit(2)) >> 2;                                       \
    const uint16_t imm_5_3 = (uimm & right_n_bits(6)) >> 3;                                  \
    uint16_t insn = 0;                                                                       \
    c_patch((address)&insn, 15, 13, funct3);                                                 \
    c_patch((address)&insn, 12, 10, imm_5_3);                                                \
    c_patch_compressed_reg((address)&insn, 7, Rs1);                                          \
    c_patch((address)&insn, 6, 6, imm_2);                                                    \
    c_patch((address)&insn, 5, 5, imm_6);                                                    \
    c_patch_compressed_reg((address)&insn, 2, Rd_Rs2);                                       \
    c_patch((address)&insn, 1, 0, op);                                                       \
    emit_int16(insn);                                                                        \
  }

  INSN(c_lw, 0b010, 0b00);
  INSN(c_sw, 0b110, 0b00);

#undef INSN

// c.ebreak: fixed 16-bit encoding with no operands.
// Layout: insn[15:13] = funct3, insn[12] = 1, insn[11:2] = 0, insn[1:0] = op.
#define INSN(NAME, funct3, op)                                                               \
  void NAME() {                                                                              \
    uint16_t insn = 0;                                                                       \
    c_patch((address)&insn, 15, 13, funct3);                                                 \
    c_patch((address)&insn, 12, 12, 0b1);                                                    \
    c_patch((address)&insn, 11, 2, 0x0);                                                     \
    c_patch((address)&insn, 1, 0, op);                                                       \
    emit_int16(insn);                                                                        \
  }

  INSN(c_ebreak, 0b100, 0b10);

#undef INSN

// --------------  RVC Transformation Functions  --------------

// --------------------------
// Register instructions
// --------------------------
// add -> c.add
// The compressed form is chosen when compression is active, neither source is x0
// and Rd is also one of the sources; the other source becomes the c.add operand
// (Rs1 when Rd == Rs2, otherwise Rs2). Anything else falls back to _add.
#define INSN(NAME)                                                                             \
  void NAME(Register Rd, Register Rs1, Register Rs2) {                                         \
    if (do_compress() && Rs1 != x0 && Rs2 != x0 && (Rs2 == Rd || Rs1 == Rd)) {                 \
      c_add(Rd, (Rs2 == Rd) ? Rs1 : Rs2);                                                      \
    } else {                                                                                   \
      _add(Rd, Rs1, Rs2);                                                                      \
    }                                                                                          \
  }

  INSN(add);

#undef INSN

// --------------------------
// sub/subw -> c.sub/c.subw
// The compressed form is used only when compression is active, Rd == Rs1 and both
// Rd and Rs2 pass is_compressed_valid(); every other case emits NORMAL_NAME.
#define INSN(NAME, C_NAME, NORMAL_NAME)                                                      \
  void NAME(Register Rd, Register Rs1, Register Rs2) {                                       \
    if (!do_compress() || Rd != Rs1 ||                                                       \
        !Rd->is_compressed_valid() || !Rs2->is_compressed_valid()) {                         \
      NORMAL_NAME(Rd, Rs1, Rs2);                                                             \
      return;                                                                                \
    }                                                                                        \
    C_NAME(Rd, Rs2);                                                                         \
  }

  INSN(sub,  c_sub,  _sub);
  INSN(subw, c_subw, _subw);

#undef INSN

// --------------------------
// and/or/xor/addw -> c.and/c.or/c.xor/c.addw
// The compressed form is used when compression is active, both sources pass
// is_compressed_valid() and Rd is also one of the sources; the other source
// becomes the second operand (Rs1 when Rd == Rs2, otherwise Rs2).
#define INSN(NAME, C_NAME, NORMAL_NAME)                                                      \
  void NAME(Register Rd, Register Rs1, Register Rs2) {                                       \
    if (do_compress() &&                                                                     \
        Rs1->is_compressed_valid() && Rs2->is_compressed_valid() &&                          \
        (Rs2 == Rd || Rs1 == Rd)) {                                                          \
      C_NAME(Rd, (Rs2 == Rd) ? Rs1 : Rs2);                                                   \
    } else {                                                                                 \
      NORMAL_NAME(Rd, Rs1, Rs2);                                                             \
    }                                                                                        \
  }

  INSN(andr, c_and,  _andr);
  INSN(orr,  c_or,   _orr);
  INSN(xorr, c_xor,  _xorr);
  INSN(addw, c_addw, _addw);

#undef INSN

private:
// some helper functions
// Predicates for the sp-relative integer forms (c.ldsp/c.sdsp and c.lwsp/c.swsp).
// They hold when the base is sp, imm12 is an unsigned value of `bits` bits with
// none of the `align_mask` bits set, and - for loads (ld == true) - the
// destination is not x0.
#define FUNC(NAME, align_mask, bits)                                                         \
  bool NAME(Register rs1, Register rd_rs2, int32_t imm12, bool ld) {                         \
    if (rs1 != sp || !is_uimm(imm12, bits)) {                                                \
      return false;                                                                          \
    }                                                                                        \
    if ((intx(imm12) & align_mask) != 0x0) {                                                 \
      return false;                                                                          \
    }                                                                                        \
    return !ld || rd_rs2 != x0;                                                              \
  }

  FUNC(is_c_ldsdsp,  0b111, 9);
  FUNC(is_c_lwswsp,  0b011, 8);

#undef FUNC

// Predicate for the sp-relative floating-point forms (c.fldsp/c.fsdsp): the base
// must be sp and imm12 must be an unsigned value of `bits` bits with none of the
// `align_mask` bits set.
#define FUNC(NAME, align_mask, bits)                                                         \
  bool NAME(Register rs1, int32_t imm12) {                                                   \
    if (rs1 != sp || !is_uimm(imm12, bits)) {                                                \
      return false;                                                                          \
    }                                                                                        \
    return (intx(imm12) & align_mask) == 0x0;                                                \
  }

  FUNC(is_c_fldsdsp, 0b111, 9);

#undef FUNC

// Predicates for the register-based forms (c.ld/c.sd, c.lw/c.sw, c.fld/c.fsd):
// both registers must pass is_compressed_valid() and imm12 must be an unsigned
// value of `bits` bits with none of the `align_mask` bits set.
#define FUNC(NAME, REG_TYPE, align_mask, bits)                                               \
  bool NAME(Register rs1, REG_TYPE rd_rs2, int32_t imm12) {                                  \
    if (!rs1->is_compressed_valid() || !rd_rs2->is_compressed_valid()) {                     \
      return false;                                                                          \
    }                                                                                        \
    if (!is_uimm(imm12, bits)) {                                                             \
      return false;                                                                          \
    }                                                                                        \
    return (intx(imm12) & align_mask) == 0x0;                                                \
  }

  FUNC(is_c_ldsd,  Register,      0b111, 8);
  FUNC(is_c_lwsw,  Register,      0b011, 7);
  FUNC(is_c_fldsd, FloatRegister, 0b111, 8);

#undef FUNC

public:
  // True when 16-bit encodings may be emitted: the UseRVC flag is set and
  // in_compressible_scope() holds.
  bool do_compress() const {
    if (!UseRVC) {
      return false;
    }
    return in_compressible_scope();
  }

  // True when Zcb encodings may be emitted: do_compress() holds, UseZcb is set,
  // and every register that was actually supplied (i.e. is not noreg) passes
  // is_compressed_valid().
  bool do_compress_zcb(Register reg1 = noreg, Register reg2 = noreg) const {
    if (!(do_compress() && UseZcb)) {
      return false;
    }
    if (reg1 != noreg && !reg1->is_compressed_valid()) {
      return false;
    }
    return reg2 == noreg || reg2->is_compressed_valid();
  }

  // Same as do_compress_zcb(), with the additional requirement that UseZbb is set.
  bool do_compress_zcb_zbb(Register reg1 = noreg, Register reg2 = noreg) const {
    if (!do_compress_zcb(reg1, reg2)) {
      return false;
    }
    return UseZbb;
  }

// --------------------------
// Load/store register
// --------------------------
  // lw -> c.lwsp/c.lw
  // With compression active, prefers the sp-relative form, then the
  // register-based form; otherwise (or if neither fits) emits _lw.
  void lw(Register Rd, Register Rs, const int32_t offset) {
    if (!do_compress()) {
      _lw(Rd, Rs, offset);
    } else if (is_c_lwswsp(Rs, Rd, offset, true)) {
      c_lwsp(Rd, offset);
    } else if (is_c_lwsw(Rs, Rd, offset)) {
      c_lw(Rd, Rs, offset);
    } else {
      _lw(Rd, Rs, offset);
    }
  }

// --------------------------
  // ld -> c.ldsp/c.ld
  // With compression active, prefers the sp-relative form, then the
  // register-based form; otherwise (or if neither fits) emits _ld.
  void ld(Register Rd, Register Rs, const int32_t offset) {
    if (!do_compress()) {
      _ld(Rd, Rs, offset);
    } else if (is_c_ldsdsp(Rs, Rd, offset, true)) {
      c_ldsp(Rd, offset);
    } else if (is_c_ldsd(Rs, Rd, offset)) {
      c_ld(Rd, Rs, offset);
    } else {
      _ld(Rd, Rs, offset);
    }
  }

// --------------------------
  // fld -> c.fldsp/c.fld
  // With compression active, prefers the sp-relative form, then the
  // register-based form; otherwise (or if neither fits) emits _fld.
  void fld(FloatRegister Rd, Register Rs, const int32_t offset) {
    if (!do_compress()) {
      _fld(Rd, Rs, offset);
    } else if (is_c_fldsdsp(Rs, offset)) {
      c_fldsp(Rd, offset);
    } else if (is_c_fldsd(Rs, Rd, offset)) {
      c_fld(Rd, Rs, offset);
    } else {
      _fld(Rd, Rs, offset);
    }
  }

// --------------------------
  // sd -> c.sdsp/c.sd
  // With compression active, prefers the sp-relative form, then the
  // register-based form; otherwise (or if neither fits) emits _sd.
  void sd(Register Rs2, Register Rs1, const int32_t offset) {
    if (!do_compress()) {
      _sd(Rs2, Rs1, offset);
    } else if (is_c_ldsdsp(Rs1, Rs2, offset, false)) {
      c_sdsp(Rs2, offset);
    } else if (is_c_ldsd(Rs1, Rs2, offset)) {
      c_sd(Rs2, Rs1, offset);
    } else {
      _sd(Rs2, Rs1, offset);
    }
  }

// --------------------------
  // sw -> c.swsp/c.sw
  // With compression active, prefers the sp-relative form, then the
  // register-based form; otherwise (or if neither fits) emits _sw.
  void sw(Register Rs2, Register Rs1, const int32_t offset) {
    if (!do_compress()) {
      _sw(Rs2, Rs1, offset);
    } else if (is_c_lwswsp(Rs1, Rs2, offset, false)) {
      c_swsp(Rs2, offset);
    } else if (is_c_lwsw(Rs1, Rs2, offset)) {
      c_sw(Rs2, Rs1, offset);
    } else {
      _sw(Rs2, Rs1, offset);
    }
  }

// --------------------------
  // fsd -> c.fsdsp/c.fsd
  // With compression active, prefers the sp-relative form, then the
  // register-based form; otherwise (or if neither fits) emits _fsd.
  void fsd(FloatRegister Rs2, Register Rs1, const int32_t offset) {
    if (!do_compress()) {
      _fsd(Rs2, Rs1, offset);
    } else if (is_c_fldsdsp(Rs1, offset)) {
      c_fsdsp(Rs2, offset);
    } else if (is_c_fldsd(Rs1, Rs2, offset)) {
      c_fsd(Rs2, Rs1, offset);
    } else {
      _fsd(Rs2, Rs1, offset);
    }
  }

// --------------------------
// Unconditional branch instructions
// --------------------------
 protected:
  // All calls and jumps must go via MASM. Only use x1 (aka ra) as link register for now.
  // jalr -> c.jr/c.jalr
  // A zero-offset jump through a non-x0 register compresses to c.jalr when the
  // link register is x1 and to c.jr when it is x0; everything else emits _jalr.
  void jalr(Register Rd, Register Rs, const int32_t offset) {
    assert(Rd != x5 && Rs != x5, "Register x5 must not be used for calls/jumps.");
    if (do_compress() && offset == 0 && Rs != x0) {
      if (Rd == x1) {
        c_jalr(Rs);
        return;
      }
      if (Rd == x0) {
        c_jr(Rs);
        return;
      }
    }
    _jalr(Rd, Rs, offset);
  }

  // jal -> c.j (c.jal is RV32C only, so only the Rd == x0 form is compressed).
  // The offset must additionally be an even signed 12-bit value.
  void jal(Register Rd, const int32_t offset) {
    assert(Rd != x5, "Register x5 must not be used for calls/jumps.");
    const bool use_c_j = do_compress() &&
                         Rd == x0 &&
                         is_simm12(offset) && ((offset % 2) == 0);
    if (use_c_j) {
      c_j(offset);
    } else {
      _jal(Rd, offset);
    }
  }

  public:

// --------------------------
// Miscellaneous Instructions
// --------------------------
// ebreak -> c.ebreak whenever compression is active, _ebreak otherwise.
#define INSN(NAME)                                                     \
  void NAME() {                                                        \
    if (do_compress()) {                                               \
      c_ebreak();                                                      \
    } else {                                                           \
      _ebreak();                                                       \
    }                                                                  \
  }

  INSN(ebreak);

#undef INSN

// --------------------------
// Immediate Instructions
// --------------------------
// addi -> c.addi/c.nop/c.mv/c.addi16sp/c.addi4spn
// With compression active the candidates are tried in this order:
//   1. Rd == Rs1 and imm fits in a signed 6-bit value         -> c.addi
//   2. imm == 0 and neither register is x0                    -> c.mv
//   3. Rs1 == sp, imm != 0:
//        Rd == sp, imm a multiple of 16 within simm10         -> c.addi16sp
//        Rd compressed-valid, imm a multiple of 4 in uimm10   -> c.addi4spn
// If none applies, the 32-bit _addi is emitted.
#define INSN(NAME)                                                                           \
  void NAME(Register Rd, Register Rs1, int64_t imm) {                                        \
    if (!do_compress()) {                                                                    \
      _addi(Rd, Rs1, imm);                                                                   \
      return;                                                                                \
    }                                                                                        \
    if (Rd == Rs1 && is_simm6(imm)) {                                                        \
      c_addi(Rd, imm);                                                                       \
      return;                                                                                \
    }                                                                                        \
    if (imm == 0 && Rd != x0 && Rs1 != x0) {                                                 \
      c_mv(Rd, Rs1);                                                                         \
      return;                                                                                \
    }                                                                                        \
    if (Rs1 == sp && imm != 0) {                                                             \
      if (Rd == Rs1 && (imm & 0b1111) == 0x0 && is_simm10(imm)) {                            \
        c_addi16sp(imm);                                                                     \
        return;                                                                              \
      }                                                                                      \
      if (Rd->is_compressed_valid() && (imm & 0b11) == 0x0 && is_uimm10(imm)) {              \
        c_addi4spn(Rd, imm);                                                                 \
        return;                                                                              \
      }                                                                                      \
    }                                                                                        \
    _addi(Rd, Rs1, imm);                                                                     \
  }

  INSN(addi);

#undef INSN

// --------------------------
// addiw -> c.addiw
// Compressed when compression is active, Rd == Rs1, Rd is not x0 and imm fits in
// a signed 6-bit value; otherwise _addiw is emitted.
#define INSN(NAME)                                                                           \
  void NAME(Register Rd, Register Rs1, int64_t imm) {                                        \
    if (do_compress() && Rd == Rs1 && Rd != x0 && is_simm6(imm)) {                           \
      c_addiw(Rd, imm);                                                                      \
    } else {                                                                                 \
      _addiw(Rd, Rs1, imm);                                                                  \
    }                                                                                        \
  }

  INSN(addiw);

#undef INSN

// --------------------------
// andi -> c.andi
// Compressed when compression is active, Rd == Rs1, Rd passes
// is_compressed_valid() and imm fits in a signed 6-bit value; otherwise _andi.
#define INSN(NAME)                                                                           \
  void NAME(Register Rd, Register Rs1, int64_t imm) {                                        \
    if (do_compress() && Rd == Rs1 && Rd->is_compressed_valid() && is_simm6(imm)) {          \
      c_andi(Rd, imm);                                                                       \
    } else {                                                                                 \
      _andi(Rd, Rs1, imm);                                                                   \
    }                                                                                        \
  }

  INSN(andi);

#undef INSN

// --------------------------
// Shift Immediate Instructions
// --------------------------
// slli -> c.slli
// A zero shift emits no shift at all: it becomes addi(Rd, Rs1, 0) when the
// registers differ and nothing when they are equal. A non-zero shift compresses
// to c.slli when compression is active, Rd == Rs1 and Rd is not x0.
#define INSN(NAME)                                                                           \
  void NAME(Register Rd, Register Rs1, unsigned shamt) {                                     \
    if (shamt == 0) {                                                                        \
      if (Rd != Rs1) {                                                                       \
        addi(Rd, Rs1, 0);                                                                    \
      }                                                                                      \
      return;                                                                                \
    }                                                                                        \
    if (do_compress() && Rd == Rs1 && Rd != x0) {                                            \
      c_slli(Rd, shamt);                                                                     \
      return;                                                                                \
    }                                                                                        \
    _slli(Rd, Rs1, shamt);                                                                   \
  }

  INSN(slli);

#undef INSN

// --------------------------
// srai/srli -> c.srai/c.srli
// A zero shift emits no shift at all: it becomes addi(Rd, Rs1, 0) when the
// registers differ and nothing when they are equal. A non-zero shift uses the
// compressed form when compression is active, Rd == Rs1 and Rd passes
// is_compressed_valid().
#define INSN(NAME, C_NAME, NORMAL_NAME)                                                      \
  void NAME(Register Rd, Register Rs1, unsigned shamt) {                                     \
    if (shamt == 0) {                                                                        \
      if (Rd != Rs1) {                                                                       \
        addi(Rd, Rs1, 0);                                                                    \
      }                                                                                      \
      return;                                                                                \
    }                                                                                        \
    if (do_compress() && Rd == Rs1 && Rd->is_compressed_valid()) {                           \
      C_NAME(Rd, shamt);                                                                     \
      return;                                                                                \
    }                                                                                        \
    NORMAL_NAME(Rd, Rs1, shamt);                                                             \
  }

  INSN(srai, c_srai, _srai);
  INSN(srli, c_srli, _srli);

#undef INSN

// --------------------------
// Upper Immediate Instruction
// --------------------------
// lui -> c.lui
// Compressed when compression is active, Rd is neither x0 nor x2, and imm is a
// non-zero value accepted by is_simm18(); otherwise _lui is emitted.
#define INSN(NAME)                                                                           \
  void NAME(Register Rd, int32_t imm) {                                                      \
    if (do_compress() && Rd != x0 && Rd != x2 && imm != 0 && is_simm18(imm)) {               \
      c_lui(Rd, imm);                                                                        \
    } else {                                                                                 \
      _lui(Rd, imm);                                                                         \
    }                                                                                        \
  }

  INSN(lui);

#undef INSN

// Cache Management Operations
// These instructions may be turned off for user space.
 private:
  // Operation selectors for cbo_base(), which writes the value into insn[31:20].
  enum CBO_FUNCT : unsigned int {
    CBO_INVAL = 0x0,
    CBO_CLEAN = 0x1,
    CBO_FLUSH = 0x2,
    CBO_ZERO  = 0x4
  };

  // Emits one cache-block operation (cbo.inval / cbo.clean / cbo.flush / cbo.zero,
  // selected by FUNCT) on the address held in Rs1.
  // Layout: insn[6:0] = 0b0001111, insn[14:12] = 0b010, Rs1 at bit 15,
  // insn[31:20] = FUNCT.
  template <CBO_FUNCT FUNCT>
  void cbo_base(Register Rs1) {
    // cbo.zero is the Zicboz operation; the others are Zicbom operations. Check
    // the extension that matches the requested operation, so that a Zicbom
    // operation is not accepted merely because Zicboz happens to be enabled.
    assert((FUNCT == CBO_ZERO) ? UseZicboz : UseZicbom, "sanity");
    unsigned insn = 0;
    patch((address)&insn, 6,  0, 0b0001111);
    patch((address)&insn, 14, 12, 0b010);
    patch_reg((address)&insn, 15, Rs1);
    patch((address)&insn, 31, 20, FUNCT);
    emit(insn);
  }

  // cbo.inval: forwards to cbo_base with CBO_INVAL.
  // This instruction has some security implications.
  // At this time it's not likely to be enabled for user mode.
  void cbo_inval(Register Rs1) { cbo_base<CBO_INVAL>(Rs1); }
 public:
  // Zicbom: cbo.clean and cbo.flush on the address in Rs1 (forwarded to cbo_base).
  void cbo_clean(Register Rs1) { cbo_base<CBO_CLEAN>(Rs1); }
  void cbo_flush(Register Rs1) { cbo_base<CBO_FLUSH>(Rs1); }
  // Zicboz: cbo.zero on the address in Rs1 (forwarded to cbo_base).
  void cbo_zero(Register Rs1)  { cbo_base<CBO_ZERO>(Rs1); }

 private:
  // Hint selectors for prefetch_base(), which writes the value into insn[24:20].
  enum PREFETCH_FUNCT : unsigned int {
    PREFETCH_I = 0x0,
    PREFETCH_R = 0x1,
    PREFETCH_W = 0x3
  };

  // Emits one Zicbop prefetch hint (selected by FUNCT) for the address Rs1 + offset.
  //
  // offset must be a signed 12-bit value whose low five bits are zero; only
  // offset[11:5] is encoded, in insn[31:25].
  // Layout: insn[6:0] = 0b0010011, insn[14:12] = 0b110, Rs1 at bit 15,
  // insn[24:20] = FUNCT, insn[31:25] = offset[11:5].
  template <PREFETCH_FUNCT FUNCT>
  void prefetch_base(Register Rs1, int32_t offset) {
    assert_cond(UseZicbop);
    // Range-check before masking: the 0x7f mask below would otherwise silently
    // wrap an out-of-range offset (e.g. 4096) onto a different, valid encoding.
    guarantee(is_simm12(offset), "offset is invalid.");
    guarantee((offset & 0x1f) == 0, "offset lowest 5 bits must be zero");
    int32_t upperOffset = offset >> 5;
    unsigned insn = 0;
    patch((address)&insn, 6,  0, 0b0010011);
    patch((address)&insn, 14, 12, 0b110);
    patch_reg((address)&insn, 15, Rs1);
    patch((address)&insn, 24, 20, FUNCT);
    // Keep only the seven encodable bits; this also drops the sign-extension
    // bits of a negative offset.
    upperOffset &= 0x7f;
    patch((address)&insn, 31, 25, upperOffset);
    emit(insn);
  }

 public:
  // Zicbop: prefetch hints for the address Rs1 + offset. Each one forwards to
  // prefetch_base with the matching PREFETCH_FUNCT selector; prefetch_base
  // requires the low five bits of offset to be zero.
  void prefetch_i(Register Rs1, int32_t offset) { prefetch_base<PREFETCH_I>(Rs1, offset); }
  void prefetch_r(Register Rs1, int32_t offset) { prefetch_base<PREFETCH_R>(Rs1, offset); }
  void prefetch_w(Register Rs1, int32_t offset) { prefetch_base<PREFETCH_W>(Rs1, offset); }

// --------------  Zicond Instruction Definitions  --------------
// Zicond conditional operations extension
  private:
  // Operation selectors for czero(), which writes the value into insn[14:12].
  enum CZERO_OP : unsigned int {
    CZERO_NEZ = 0x7,
    CZERO_EQZ = 0x5
  };

  // Emits czero.eqz or czero.nez (selected by OP_VALUE); requires UseZicond.
  // Layout, most significant field first:
  //   insn[31:25] = 0b0000111, Rs2 (condition) at bit 20, Rs1 (value) at bit 15,
  //   insn[14:12] = OP_VALUE, Rd at bit 7, insn[6:0] = 0b0110011.
  template <CZERO_OP OP_VALUE>
  void czero(Register Rd, Register Rs1, Register Rs2) {
    assert_cond(UseZicond);
    uint32_t insn = 0;
    patch((address)&insn, 31, 25, 0b0000111);
    patch_reg((address)&insn, 20, Rs2);
    patch_reg((address)&insn, 15, Rs1);
    patch((address)&insn, 14, 12, OP_VALUE);
    patch_reg((address)&insn, 7, Rd);
    patch((address)&insn, 6, 0, 0b0110011);
    emit_int32(insn);
  }

  public:
  // czero.eqz (forwards to czero with CZERO_EQZ):
  // moves zero to a register rd, if the condition rs2 is equal to zero, otherwise moves rs1 to rd.
  void czero_eqz(Register rd, Register rs1_value, Register rs2_condition) {
    czero<CZERO_EQZ>(rd, rs1_value, rs2_condition);
  }

  // czero.nez (forwards to czero with CZERO_NEZ):
  // moves zero to a register rd, if the condition rs2 is nonzero, otherwise moves rs1 to rd.
  void czero_nez(Register rd, Register rs1_value, Register rs2_condition) {
    czero<CZERO_NEZ>(rd, rs1_value, rs2_condition);
  }

// --------------  ZCB Instruction Definitions  --------------
// Zcb additional C instructions
 private:
  // Format CLH, c.lh/c.lhu
  // Shared 16-bit encoder behind c_lh (Unsigned == false) and c_lhu
  // (Unsigned == true).
  //   Rd_Rs2 - register encoded from bit 2
  //   Rs1    - register encoded from bit 7
  //   uimm   - offset; asserted to be 0 or 2, only bit 1 is encoded
  // Also asserts do_compress_zcb(Rd_Rs2, Rs1).
  template <bool Unsigned>
  void c_lh_if(Register Rd_Rs2, Register Rs1, uint32_t uimm) {
    assert_cond(uimm == 0 || uimm == 2);
    assert_cond(do_compress_zcb(Rd_Rs2, Rs1));
    const auto uimm_bit1 = (uimm & nth_bit(1)) >> 1;
    uint16_t encoding = 0;
    const address enc = (address)&encoding;
    c_patch(enc, 1, 0, 0b00);
    c_patch_compressed_reg(enc, 2, Rd_Rs2);
    c_patch(enc, 5, 5, uimm_bit1);           // uimm[1]
    c_patch(enc, 6, 6, Unsigned ? 0 : 1);    // 0 for the unsigned form, 1 for the signed form
    c_patch_compressed_reg(enc, 7, Rs1);
    c_patch(enc, 12, 10, 0b001);
    c_patch(enc, 15, 13, 0b100);
    emit_int16(encoding);
  }

  // Chooses between the compressed and the regular halfword load.
  // The compressed form (c_lh_if<Unsigned>) is used only when
  // do_compress_zcb(Rd_Rs2, Rs1) holds and uimm is 0 or 2; otherwise _lhu
  // (Unsigned) or _lh (signed) is emitted.
  template <bool Unsigned>
  void lh_c_mux(Register Rd_Rs2, Register Rs1, const int32_t uimm) {
    const bool offset_encodable = (uimm == 0) || (uimm == 2);
    if (do_compress_zcb(Rd_Rs2, Rs1) && offset_encodable) {
      c_lh_if<Unsigned>(Rd_Rs2, Rs1, uimm);
      return;
    }

    // Fall back to the regular load.
    if (!Unsigned) {
      _lh(Rd_Rs2, Rs1, uimm);
    } else {
      _lhu(Rd_Rs2, Rs1, uimm);
    }
  }

  // Format CU, c.[sz]ext.*, c.not
  // Shared 16-bit encoder for the single-register operations in this section
  // (c_zext_b, c_sext_b, c_zext_h, c_sext_h, c_zext_w, c_not).
  // InstructionType is written into bits [4:2] and selects the operation;
  // Rs1 is the register encoded from bit 7.
  // Asserts do_compress_zcb(Rs1).
  template <uint8_t InstructionType>
  void c_u_if(Register Rs1) {
    assert_cond(do_compress_zcb(Rs1));
    uint16_t encoding = 0;
    const address enc = (address)&encoding;
    c_patch(enc, 1, 0, 0b01);
    c_patch(enc, 4, 2, InstructionType);
    c_patch(enc, 6, 5, 0b11);
    c_patch_compressed_reg(enc, 7, Rs1);
    c_patch(enc, 12, 10, 0b111);
    c_patch(enc, 15, 13, 0b100);
    emit_int16(encoding);
  }

 public:

  // Prerequisites: Zcb
  // Always emits the compressed signed halfword load via c_lh_if<false>, which
  // asserts that uimm is 0 or 2 and that do_compress_zcb(Rd_Rs2, Rs1) holds.
  void c_lh(Register Rd_Rs2, Register Rs1, const int32_t uimm)  {  c_lh_if<false>(Rd_Rs2, Rs1, uimm); }
  // Signed halfword load: lh_c_mux<false> emits the compressed form when it is
  // applicable and _lh otherwise.
  void lh(Register Rd_Rs2, Register Rs1, const int32_t uimm)    { lh_c_mux<false>(Rd_Rs2, Rs1, uimm); }

  // Prerequisites: Zcb
  // Always emits the compressed unsigned halfword load via c_lh_if<true>, which
  // asserts that uimm is 0 or 2 and that do_compress_zcb(Rd_Rs2, Rs1) holds.
  void c_lhu(Register Rd_Rs2, Register Rs1, const int32_t uimm) {  c_lh_if<true>(Rd_Rs2, Rs1, uimm); }
  // Unsigned halfword load: lh_c_mux<true> emits the compressed form when it is
  // applicable and _lhu otherwise.
  void lhu(Register Rd_Rs2, Register Rs1, const int32_t uimm)   { lh_c_mux<true>(Rd_Rs2, Rs1, uimm); }

  // Prerequisites: Zcb
  // Format CLB, single instruction
  // Encodes and emits the 16-bit c.lbu instruction.
  //   Rd_Rs2 - register encoded from bit 2
  //   Rs1    - register encoded from bit 7
  //   uimm   - offset, asserted to be in [0, 3]; bit 1 goes to insn bit 5 and
  //            bit 0 goes to insn bit 6
  // Also asserts do_compress_zcb(Rd_Rs2, Rs1).
  void c_lbu(Register Rd_Rs2, Register Rs1, uint32_t uimm) {
    assert_cond(uimm <= 3);
    assert_cond(do_compress_zcb(Rd_Rs2, Rs1));
    const auto uimm_bit1 = (uimm & nth_bit(1)) >> 1;
    const auto uimm_bit0 = (uimm & nth_bit(0)) >> 0;
    uint16_t encoding = 0;
    const address enc = (address)&encoding;
    c_patch(enc, 1, 0, 0b00);
    c_patch_compressed_reg(enc, 2, Rd_Rs2);
    c_patch(enc, 5, 5, uimm_bit1);
    c_patch(enc, 6, 6, uimm_bit0);
    c_patch_compressed_reg(enc, 7, Rs1);
    c_patch(enc, 12, 10, 0b000);
    c_patch(enc, 15, 13, 0b100);
    emit_int16(encoding);
  }

  // Unsigned byte load. Emits c_lbu when do_compress_zcb(Rd_Rs2, Rs1) holds and
  // uimm lies in [0, 3]; otherwise emits the regular _lbu.
  void lbu(Register Rd_Rs2, Register Rs1, const int32_t uimm) {
    const bool offset_encodable = (0 <= uimm) && (uimm <= 3);
    if (!do_compress_zcb(Rd_Rs2, Rs1) || !offset_encodable) {
      _lbu(Rd_Rs2, Rs1, uimm);
      return;
    }
    c_lbu(Rd_Rs2, Rs1, uimm);
  }

  // Prerequisites: Zcb
  // Format CSB, single instruction
  // Encodes and emits the 16-bit c.sb instruction.
  //   Rd_Rs2 - register encoded from bit 2
  //   Rs1    - register encoded from bit 7
  //   uimm   - offset, asserted to be in [0, 3]; bit 1 goes to insn bit 5 and
  //            bit 0 goes to insn bit 6
  // Also asserts do_compress_zcb(Rd_Rs2, Rs1).
  void c_sb(Register Rd_Rs2, Register Rs1, uint32_t uimm) {
    assert_cond(uimm <= 3);
    assert_cond(do_compress_zcb(Rd_Rs2, Rs1));
    const auto uimm_bit1 = (uimm & nth_bit(1)) >> 1;
    const auto uimm_bit0 = (uimm & nth_bit(0)) >> 0;
    uint16_t encoding = 0;
    const address enc = (address)&encoding;
    c_patch(enc, 1, 0, 0b00);
    c_patch_compressed_reg(enc, 2, Rd_Rs2);
    c_patch(enc, 5, 5, uimm_bit1);
    c_patch(enc, 6, 6, uimm_bit0);
    c_patch_compressed_reg(enc, 7, Rs1);
    c_patch(enc, 12, 10, 0b010);
    c_patch(enc, 15, 13, 0b100);
    emit_int16(encoding);
  }

  // Byte store. Emits c_sb when do_compress_zcb(Rd_Rs2, Rs1) holds and uimm
  // lies in [0, 3]; otherwise emits the regular _sb.
  void sb(Register Rd_Rs2, Register Rs1, const int32_t uimm) {
    const bool offset_encodable = (0 <= uimm) && (uimm <= 3);
    if (!do_compress_zcb(Rd_Rs2, Rs1) || !offset_encodable) {
      _sb(Rd_Rs2, Rs1, uimm);
      return;
    }
    c_sb(Rd_Rs2, Rs1, uimm);
  }

  // Prerequisites: Zcb
  // Format CSH, single instruction
  // Encodes and emits the 16-bit c.sh instruction.
  //   Rd_Rs2 - register encoded from bit 2
  //   Rs1    - register encoded from bit 7
  //   uimm   - offset, asserted to be 0 or 2; only bit 1 is encoded (insn bit 5)
  // Also asserts do_compress_zcb(Rd_Rs2, Rs1).
  void c_sh(Register Rd_Rs2, Register Rs1, uint32_t uimm) {
    assert_cond(uimm == 0 || uimm == 2);
    assert_cond(do_compress_zcb(Rd_Rs2, Rs1));
    const auto uimm_bit1 = (uimm & nth_bit(1)) >> 1;
    uint16_t encoding = 0;
    const address enc = (address)&encoding;
    c_patch(enc, 1, 0, 0b00);
    c_patch_compressed_reg(enc, 2, Rd_Rs2);
    c_patch(enc, 5, 5, uimm_bit1);
    c_patch(enc, 6, 6, 0);                   // fixed zero bit for c.sh
    c_patch_compressed_reg(enc, 7, Rs1);
    c_patch(enc, 12, 10, 0b011);
    c_patch(enc, 15, 13, 0b100);
    emit_int16(encoding);
  }

  // Halfword store. Emits c_sh when do_compress_zcb(Rd_Rs2, Rs1) holds and uimm
  // is 0 or 2; otherwise emits the regular _sh.
  void sh(Register Rd_Rs2, Register Rs1, const int32_t uimm) {
    const bool offset_encodable = (uimm == 0) || (uimm == 2);
    if (!do_compress_zcb(Rd_Rs2, Rs1) || !offset_encodable) {
      _sh(Rd_Rs2, Rs1, uimm);
      return;
    }
    c_sh(Rd_Rs2, Rs1, uimm);
  }

  // Prerequisites: Zcb
  // Format CU
  // Emits c.zext.b on Rs1 through the shared single-register encoder with
  // selector 0b000.
  void c_zext_b(Register Rs1) {
    // Same precondition that c_u_if checks itself.
    assert_cond(do_compress_zcb(Rs1));
    c_u_if<0b000>(Rs1);
  }

  // Prerequisites: Zbb
  // Byte sign-extension; asserts UseZbb.
  // The compressed c_sext_b takes a single register, so it is chosen only when
  // destination and source are the same register and
  // do_compress_zcb_zbb(Rd_Rs2, Rs1) holds; otherwise _sext_b is emitted.
  void sext_b(Register Rd_Rs2, Register Rs1) {
    assert_cond(UseZbb);
    const bool use_compressed = do_compress_zcb_zbb(Rd_Rs2, Rs1) && (Rd_Rs2 == Rs1);
    if (!use_compressed) {
      _sext_b(Rd_Rs2, Rs1);
      return;
    }
    c_sext_b(Rd_Rs2);
  }

  // Prerequisites: Zcb, Zbb
  // Format CU
  // Emits c.sext.b on Rs1 through the shared single-register encoder with
  // selector 0b001; c_u_if asserts do_compress_zcb(Rs1).
  void c_sext_b(Register Rs1) {
    c_u_if<0b001>(Rs1);
  }

  // Prerequisites: Zbb
  // Halfword zero-extension; asserts UseZbb.
  // The compressed c_zext_h takes a single register, so it is chosen only when
  // destination and source are the same register and
  // do_compress_zcb_zbb(Rd_Rs2, Rs1) holds; otherwise _zext_h is emitted.
  void zext_h(Register Rd_Rs2, Register Rs1) {
    assert_cond(UseZbb);
    const bool use_compressed = do_compress_zcb_zbb(Rd_Rs2, Rs1) && (Rd_Rs2 == Rs1);
    if (!use_compressed) {
      _zext_h(Rd_Rs2, Rs1);
      return;
    }
    c_zext_h(Rd_Rs2);
  }

  // Prerequisites: Zcb, Zbb
  // Format CU
  // Emits c.zext.h on Rs1 through the shared single-register encoder with
  // selector 0b010; c_u_if asserts do_compress_zcb(Rs1).
  void c_zext_h(Register Rs1) {
    c_u_if<0b010>(Rs1);
  }

  // Prerequisites: Zbb
  // Halfword sign-extension; asserts UseZbb.
  // The compressed c_sext_h takes a single register, so it is chosen only when
  // destination and source are the same register and
  // do_compress_zcb_zbb(Rd_Rs2, Rs1) holds; otherwise _sext_h is emitted.
  void sext_h(Register Rd_Rs2, Register Rs1) {
    assert_cond(UseZbb);
    const bool use_compressed = do_compress_zcb_zbb(Rd_Rs2, Rs1) && (Rd_Rs2 == Rs1);
    if (!use_compressed) {
      _sext_h(Rd_Rs2, Rs1);
      return;
    }
    c_sext_h(Rd_Rs2);
  }

  // Prerequisites: Zcb, Zbb
  // Format CU
  // Emits c.sext.h on Rs1 through the shared single-register encoder with
  // selector 0b011; c_u_if asserts do_compress_zcb(Rs1).
  void c_sext_h(Register Rs1) {
    c_u_if<0b011>(Rs1);
  }

  // Prerequisites: Zcb, Zba
  // Format CU
  // Emits c.zext.w on Rs1 through the shared single-register encoder with
  // selector 0b100; c_u_if asserts do_compress_zcb(Rs1).
  // NOTE(review): the Zba prerequisite named above is not asserted here — confirm
  // callers check it.
  void c_zext_w(Register Rs1) {
    c_u_if<0b100>(Rs1);
  }

  // Prerequisites: Zcb
  // Format CU
  // Emits c.not on Rs1 through the shared single-register encoder with
  // selector 0b101; c_u_if asserts do_compress_zcb(Rs1).
  void c_not(Register Rs1) {
    c_u_if<0b101>(Rs1);
  }

  // Prerequisites: Zcb (M or Zmmul)
  // Format CA, c.mul
  // Encodes and emits the 16-bit c.mul instruction (Rd_Rs1 = Rd_Rs1 * Rs2).
  //   Rd_Rs1 - register that is both destination and first source, encoded
  //            from bit 7
  //   Rs2    - second source register, encoded from bit 2
  // Asserts do_compress_zcb(Rd_Rs1, Rs2), like the other c_* emitters in this
  // section, so that a direct call cannot silently emit a Zcb instruction when
  // compression is not permitted for these registers.
  void c_mul(Register Rd_Rs1, Register Rs2) {
    assert_cond(do_compress_zcb(Rd_Rs1, Rs2));
    uint16_t insn = 0;
    c_patch((address)&insn, 1, 0, 0b01);
    c_patch_compressed_reg((address)&insn, 2, Rs2);
    c_patch((address)&insn, 6, 5, 0b10);
    c_patch_compressed_reg((address)&insn, 7, Rd_Rs1);
    c_patch((address)&insn, 12, 10, 0b111);
    c_patch((address)&insn, 15, 13, 0b100);
    emit_int16(insn);
  }

  // Multiply: Rd = Rs1 * Rs2.
  // c_mul has only two register operands (destination doubles as a source), so
  // it can be used only when Rd is one of the sources and
  // do_compress_zcb(Rs2, Rs1) holds. In every other case the regular _mul is
  // emitted.
  void mul(Register Rd, Register Rs1, Register Rs2) {
    const bool rd_is_a_source = (Rd == Rs1) || (Rd == Rs2);
    if (!rd_is_a_source || !do_compress_zcb(Rs2, Rs1)) {
      // Either three distinct registers would be needed (no mv is inserted),
      // or compression is not permitted: emit uncompressed.
      _mul(Rd, Rs1, Rs2);
      return;
    }

    if (Rd == Rs2) {
      // Operands are passed in swapped order so that Rd is the combined
      // destination/source.
      c_mul(Rd, Rs1);
    } else {
      assert(Rd == Rs1, "must be");
      c_mul(Rd, Rs2);
    }
  }

  // Stack overflow checking
  // Virtual hook; this base implementation only calls Unimplemented().
  virtual void bang_stack_with_offset(int offset) { Unimplemented(); }

  // Immediate-range predicates. Only the declarations live here; the
  // definitions are not part of this section of the header.
  // NOTE(review): going by the names, is_simmN presumably reports whether x
  // fits a signed N-bit immediate — confirm against the definitions.
  static bool is_simm5(int64_t x);
  static bool is_simm6(int64_t x);
  static bool is_simm12(int64_t x);
  static bool is_simm13(int64_t x);
  static bool is_simm18(int64_t x);
  static bool is_simm21(int64_t x);

  // NOTE(review): going by the names, is_uimmN presumably reports whether x
  // fits an unsigned N-bit immediate — confirm against the definitions.
  static bool is_uimm2(uint64_t x);
  static bool is_uimm3(uint64_t x);
  static bool is_uimm5(uint64_t x);
  static bool is_uimm6(uint64_t x);
  static bool is_uimm7(uint64_t x);
  static bool is_uimm8(uint64_t x);
  static bool is_uimm9(uint64_t x);
  static bool is_uimm10(uint64_t x);

  // The maximum range of a branch is fixed for the RISCV architecture.
  // Exclusive upper bound on |target - branch| used by reachable_from_branch_at().
  static const unsigned long branch_range = 1 * M;

  // Returns true when the absolute distance between `branch` and `target` is
  // strictly less than branch_range.
  static bool reachable_from_branch_at(address branch, address target) {
    const auto distance = g_uabs(target - branch);
    return distance < branch_range;
  }

  // Decode the given instruction, checking if it's a 16-bit compressed
  // instruction and return the address of the next instruction.
  // Returns the address immediately after the instruction starting at `inst`.
  // Only the first byte is inspected: if its two least-significant bits are
  // both set the instruction is wider than 16 bits and instruction_size is
  // added, otherwise compressed_instruction_size is added.
  static address locate_next_instruction(address inst) {
    const bool is_compressed = (*inst & 0x3) != 0x3;
    if (is_compressed) {
      return inst + compressed_instruction_size;
    }
    return inst + instruction_size;
  }

  // Constructs an assembler over `code` (forwarded to AbstractAssembler) and
  // initialises _in_compressible_scope to true.
  Assembler(CodeBuffer* code) : AbstractAssembler(code), _in_compressible_scope(true) {}
};

#endif // CPU_RISCV_ASSEMBLER_RISCV_HPP