1
2 /*---------------------------------------------------------------*/
3 /*--- begin guest_arm64_helpers.c ---*/
4 /*---------------------------------------------------------------*/
5
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
9
10 Copyright (C) 2013-2017 OpenWorks
11 info@open-works.net
12
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
17
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
22
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, write to the Free Software
25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 02110-1301, USA.
27
28 The GNU General Public License is contained in the file COPYING.
29 */
30
31 #include "libvex_basictypes.h"
32 #include "libvex_emnote.h"
33 #include "libvex_guest_arm64.h"
34 #include "libvex_ir.h"
35 #include "libvex.h"
36
37 #include "main_util.h"
38 #include "main_globals.h"
39 #include "guest_generic_bb_to_IR.h"
40 #include "guest_arm64_defs.h"
41
42
43 /* This file contains helper functions for arm64 guest code.  Calls to
44 these functions are generated by the back end. These calls are of
45 course in the host machine code and this file will be compiled to
46 host machine code, so that all makes sense.
47
48 Only change the signatures of these helper functions very
49 carefully. If you change the signature here, you'll have to change
50 the parameters passed to it in the IR calls constructed by
51 guest_arm64_toIR.c.
52 */
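/* Illustrative sketch only (not part of the build): on the IR side,
   guest_arm64_toIR.c constructs calls to these helpers roughly as
   shown below, using the standard IR call builders.  The exact
   argument marshalling is owned by that file, not this one.

      IRExpr** args = mkIRExprVec_4(cond_n_op, cc_dep1, cc_dep2, cc_dep3);
      IRExpr*  call
         = mkIRExprCCall(Ity_I64, 0/*regparms*/,
                         "arm64g_calculate_condition",
                         &arm64g_calculate_condition, args);
*/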
53
54
55 /* Set to 1 to get detailed profiling info about individual N, Z, C
56 and V flag evaluation. */
57 #define PROFILE_NZCV_FLAGS 0
58
59 #if PROFILE_NZCV_FLAGS
60
61 static UInt tab_eval[ARM64G_CC_OP_NUMBER][16];
62 static UInt initted = 0;
63 static UInt tot_evals = 0;
64
65 static void initCounts ( void )
66 {
67 UInt i, j;
68 for (i = 0; i < ARM64G_CC_OP_NUMBER; i++) {
69 for (j = 0; j < 16; j++) {
70 tab_eval[i][j] = 0;
71 }
72 }
73 initted = 1;
74 }
75
76 static void showCounts ( void )
77 {
78 const HChar* nameCC[16]
79 = { "EQ", "NE", "CS", "CC", "MI", "PL", "VS", "VC",
80 "HI", "LS", "GE", "LT", "GT", "LE", "AL", "NV" };
81 UInt i, j;
82 ULong sum = 0;
83 vex_printf("\nCC_OP 0 1 2 3 "
84 " 4 5 6\n");
85 vex_printf( "--------------------------------------------------"
86 "--------------------------\n");
87 for (j = 0; j < 16; j++) {
88 vex_printf("%2d %s ", j, nameCC[j]);
89 for (i = 0; i < ARM64G_CC_OP_NUMBER; i++) {
90 vex_printf("%9d ", tab_eval[i][j]);
91 sum += tab_eval[i][j];
92 }
93 vex_printf("\n");
94 }
95 vex_printf("(In total %llu calls)\n", sum);
96 }
97
98 #define NOTE_EVAL(_cc_op, _cond) \
99 do { \
100 if (!initted) initCounts(); \
101 vassert( ((UInt)(_cc_op)) < ARM64G_CC_OP_NUMBER); \
102 vassert( ((UInt)(_cond)) < 16); \
103       tab_eval[(UInt)(_cc_op)][(UInt)(_cond)]++; \
104 tot_evals++; \
105 if (0 == (tot_evals & 0x7FFF)) \
106 showCounts(); \
107 } while (0)
108
109 #endif /* PROFILE_NZCV_FLAGS */
110
111
112 /* Calculate the N flag from the supplied thunk components, in the
113 least significant bit of the word. Returned bits 63:1 are zero. */
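/* Worked example (illustrative): for a 32-bit ADDS the translator
   stores the thunk as (cc_op=ARM64G_CC_OP_ADD32, cc_dep1=argL,
   cc_dep2=argR, cc_dep3 unused).  This helper simply recomputes the
   result and extracts its top bit, so argL=0x7FFFFFFF, argR=1 gives
   res=0x80000000 and hence N=1. */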
114 static
115 ULong arm64g_calculate_flag_n ( ULong cc_op, ULong cc_dep1,
116 ULong cc_dep2, ULong cc_dep3 )
117 {
118 switch (cc_op) {
119 case ARM64G_CC_OP_COPY: {
120 /* (nzcv:28x0, unused, unused) */
121 ULong nf = (cc_dep1 >> ARM64G_CC_SHIFT_N) & 1;
122 return nf;
123 }
124 case ARM64G_CC_OP_ADD32: {
125 /* (argL, argR, unused) */
126 UInt argL = (UInt)cc_dep1;
127 UInt argR = (UInt)cc_dep2;
128 UInt res = argL + argR;
129 ULong nf = (ULong)(res >> 31);
130 return nf;
131 }
132 case ARM64G_CC_OP_ADD64: {
133 /* (argL, argR, unused) */
134 ULong argL = cc_dep1;
135 ULong argR = cc_dep2;
136 ULong res = argL + argR;
137 ULong nf = (ULong)(res >> 63);
138 return nf;
139 }
140 case ARM64G_CC_OP_SUB32: {
141 /* (argL, argR, unused) */
142 UInt argL = (UInt)cc_dep1;
143 UInt argR = (UInt)cc_dep2;
144 UInt res = argL - argR;
145 ULong nf = (ULong)(res >> 31);
146 return nf;
147 }
148 case ARM64G_CC_OP_SUB64: {
149 /* (argL, argR, unused) */
150 ULong argL = cc_dep1;
151 ULong argR = cc_dep2;
152 ULong res = argL - argR;
153 ULong nf = res >> 63;
154 return nf;
155 }
156 case ARM64G_CC_OP_ADC32: {
157 /* (argL, argR, oldC) */
158 UInt argL = cc_dep1;
159 UInt argR = cc_dep2;
160 UInt oldC = cc_dep3;
161 vassert((oldC & ~1) == 0);
162 UInt res = argL + argR + oldC;
163 ULong nf = (ULong)(res >> 31);
164 return nf;
165 }
166 case ARM64G_CC_OP_ADC64: {
167 /* (argL, argR, oldC) */
168 ULong argL = cc_dep1;
169 ULong argR = cc_dep2;
170 ULong oldC = cc_dep3;
171 vassert((oldC & ~1) == 0);
172 ULong res = argL + argR + oldC;
173 ULong nf = res >> 63;
174 return nf;
175 }
176 case ARM64G_CC_OP_SBC32: {
177 /* (argL, argR, oldC) */
178 UInt argL = cc_dep1;
179 UInt argR = cc_dep2;
180 UInt oldC = cc_dep3;
181 vassert((oldC & ~1) == 0);
182 UInt res = argL - argR - (oldC ^ 1);
183 ULong nf = (ULong)(res >> 31);
184 return nf;
185 }
186 case ARM64G_CC_OP_SBC64: {
187 /* (argL, argR, oldC) */
188 ULong argL = cc_dep1;
189 ULong argR = cc_dep2;
190 ULong oldC = cc_dep3;
191 vassert((oldC & ~1) == 0);
192 ULong res = argL - argR - (oldC ^ 1);
193 ULong nf = res >> 63;
194 return nf;
195 }
196 case ARM64G_CC_OP_LOGIC32: {
197 /* (res, unused, unused) */
198 UInt res = (UInt)cc_dep1;
199 ULong nf = res >> 31;
200 return nf;
201 }
202 case ARM64G_CC_OP_LOGIC64: {
203 /* (res, unused, unused) */
204 ULong res = cc_dep1;
205 ULong nf = res >> 63;
206 return nf;
207 }
208 //ZZ case ARMG_CC_OP_MUL: {
209 //ZZ /* (res, unused, oldC:oldV) */
210 //ZZ UInt res = cc_dep1;
211 //ZZ UInt nf = res >> 31;
212 //ZZ return nf;
213 //ZZ }
214 //ZZ case ARMG_CC_OP_MULL: {
215 //ZZ /* (resLo32, resHi32, oldC:oldV) */
216 //ZZ UInt resHi32 = cc_dep2;
217 //ZZ UInt nf = resHi32 >> 31;
218 //ZZ return nf;
219 //ZZ }
220 default:
221 /* shouldn't really make these calls from generated code */
222 vex_printf("arm64g_calculate_flag_n"
223 "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n",
224 cc_op, cc_dep1, cc_dep2, cc_dep3 );
225 vpanic("arm64g_calculate_flag_n");
226 }
227 }
228
229
230 /* Calculate the Z flag from the supplied thunk components, in the
231 least significant bit of the word. Returned bits 63:1 are zero. */
232 static
233 ULong arm64g_calculate_flag_z ( ULong cc_op, ULong cc_dep1,
234 ULong cc_dep2, ULong cc_dep3 )
235 {
236 switch (cc_op) {
237 case ARM64G_CC_OP_COPY: {
238 /* (nzcv:28x0, unused, unused) */
239 ULong zf = (cc_dep1 >> ARM64G_CC_SHIFT_Z) & 1;
240 return zf;
241 }
242 case ARM64G_CC_OP_ADD32: {
243 /* (argL, argR, unused) */
244 UInt argL = (UInt)cc_dep1;
245 UInt argR = (UInt)cc_dep2;
246 UInt res = argL + argR;
247 ULong zf = res == 0;
248 return zf;
249 }
250 case ARM64G_CC_OP_ADD64: {
251 /* (argL, argR, unused) */
252 ULong argL = cc_dep1;
253 ULong argR = cc_dep2;
254 ULong res = argL + argR;
255 ULong zf = res == 0;
256 return zf;
257 }
258 case ARM64G_CC_OP_SUB32: {
259 /* (argL, argR, unused) */
260 UInt argL = (UInt)cc_dep1;
261 UInt argR = (UInt)cc_dep2;
262 UInt res = argL - argR;
263 ULong zf = res == 0;
264 return zf;
265 }
266 case ARM64G_CC_OP_SUB64: {
267 /* (argL, argR, unused) */
268 ULong argL = cc_dep1;
269 ULong argR = cc_dep2;
270 ULong res = argL - argR;
271 ULong zf = res == 0;
272 return zf;
273 }
274 case ARM64G_CC_OP_ADC32: {
275 /* (argL, argR, oldC) */
276 UInt argL = cc_dep1;
277 UInt argR = cc_dep2;
278 UInt oldC = cc_dep3;
279 vassert((oldC & ~1) == 0);
280 UInt res = argL + argR + oldC;
281 ULong zf = res == 0;
282 return zf;
283 }
284 case ARM64G_CC_OP_ADC64: {
285 /* (argL, argR, oldC) */
286 ULong argL = cc_dep1;
287 ULong argR = cc_dep2;
288 ULong oldC = cc_dep3;
289 vassert((oldC & ~1) == 0);
290 ULong res = argL + argR + oldC;
291 ULong zf = res == 0;
292 return zf;
293 }
294 case ARM64G_CC_OP_SBC32: {
295 /* (argL, argR, oldC) */
296 UInt argL = cc_dep1;
297 UInt argR = cc_dep2;
298 UInt oldC = cc_dep3;
299 vassert((oldC & ~1) == 0);
300 UInt res = argL - argR - (oldC ^ 1);
301 ULong zf = res == 0;
302 return zf;
303 }
304 case ARM64G_CC_OP_SBC64: {
305 /* (argL, argR, oldC) */
306 ULong argL = cc_dep1;
307 ULong argR = cc_dep2;
308 ULong oldC = cc_dep3;
309 vassert((oldC & ~1) == 0);
310 ULong res = argL - argR - (oldC ^ 1);
311 ULong zf = res == 0;
312 return zf;
313 }
314 case ARM64G_CC_OP_LOGIC32: {
315 /* (res, unused, unused) */
316 UInt res = (UInt)cc_dep1;
317 ULong zf = res == 0;
318 return zf;
319 }
320 case ARM64G_CC_OP_LOGIC64: {
321 /* (res, unused, unused) */
322 ULong res = cc_dep1;
323 ULong zf = res == 0;
324 return zf;
325 }
326 //ZZ case ARMG_CC_OP_MUL: {
327 //ZZ /* (res, unused, oldC:oldV) */
328 //ZZ UInt res = cc_dep1;
329 //ZZ UInt zf = res == 0;
330 //ZZ return zf;
331 //ZZ }
332 //ZZ case ARMG_CC_OP_MULL: {
333 //ZZ /* (resLo32, resHi32, oldC:oldV) */
334 //ZZ UInt resLo32 = cc_dep1;
335 //ZZ UInt resHi32 = cc_dep2;
336 //ZZ UInt zf = (resHi32|resLo32) == 0;
337 //ZZ return zf;
338 //ZZ }
339 default:
340 /* shouldn't really make these calls from generated code */
341 vex_printf("arm64g_calculate_flag_z"
342 "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n",
343 cc_op, cc_dep1, cc_dep2, cc_dep3 );
344 vpanic("arm64g_calculate_flag_z");
345 }
346 }
347
348
349 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
350 /* Calculate the C flag from the supplied thunk components, in the
351 least significant bit of the word. Returned bits 63:1 are zero. */
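/* Note on the ADC cases below: once an incoming carry may be 1, the
   carry-out of argL + argR + oldC can no longer be recovered as just
   "res < argL", because res == argL also indicates a carry (argR +
   oldC wrapped to exactly 2^32 or 2^64).  Hence the
   oldC ? (res <= argL) : (res < argL) form.  32-bit example:
   argL=5, argR=0xFFFFFFFF, oldC=1 gives res=5, so res == argL and the
   carry-out is correctly reported as 1. */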
352 ULong arm64g_calculate_flag_c ( ULong cc_op, ULong cc_dep1,
353 ULong cc_dep2, ULong cc_dep3 )
354 {
355 switch (cc_op) {
356 case ARM64G_CC_OP_COPY: {
357 /* (nzcv:28x0, unused, unused) */
358 ULong cf = (cc_dep1 >> ARM64G_CC_SHIFT_C) & 1;
359 return cf;
360 }
361 case ARM64G_CC_OP_ADD32: {
362 /* (argL, argR, unused) */
363 UInt argL = (UInt)cc_dep1;
364 UInt argR = (UInt)cc_dep2;
365 UInt res = argL + argR;
366 ULong cf = res < argL;
367 return cf;
368 }
369 case ARM64G_CC_OP_ADD64: {
370 /* (argL, argR, unused) */
371 ULong argL = cc_dep1;
372 ULong argR = cc_dep2;
373 ULong res = argL + argR;
374 ULong cf = res < argL;
375 return cf;
376 }
377 case ARM64G_CC_OP_SUB32: {
378 /* (argL, argR, unused) */
379 UInt argL = (UInt)cc_dep1;
380 UInt argR = (UInt)cc_dep2;
381 ULong cf = argL >= argR;
382 return cf;
383 }
384 case ARM64G_CC_OP_SUB64: {
385 /* (argL, argR, unused) */
386 ULong argL = cc_dep1;
387 ULong argR = cc_dep2;
388 ULong cf = argL >= argR;
389 return cf;
390 }
391 case ARM64G_CC_OP_ADC32: {
392 /* (argL, argR, oldC) */
393 UInt argL = cc_dep1;
394 UInt argR = cc_dep2;
395 UInt oldC = cc_dep3;
396 vassert((oldC & ~1) == 0);
397 UInt res = argL + argR + oldC;
398 ULong cf = oldC ? (res <= argL) : (res < argL);
399 return cf;
400 }
401 case ARM64G_CC_OP_ADC64: {
402 /* (argL, argR, oldC) */
403 ULong argL = cc_dep1;
404 ULong argR = cc_dep2;
405 ULong oldC = cc_dep3;
406 vassert((oldC & ~1) == 0);
407 ULong res = argL + argR + oldC;
408 ULong cf = oldC ? (res <= argL) : (res < argL);
409 return cf;
410 }
411 case ARM64G_CC_OP_SBC32: {
412 /* (argL, argR, oldC) */
413 UInt argL = cc_dep1;
414 UInt argR = cc_dep2;
415 UInt oldC = cc_dep3;
416 vassert((oldC & ~1) == 0);
417 ULong cf = oldC ? (argL >= argR) : (argL > argR);
418 return cf;
419 }
420 case ARM64G_CC_OP_SBC64: {
421 /* (argL, argR, oldC) */
422 ULong argL = cc_dep1;
423 ULong argR = cc_dep2;
424 ULong oldC = cc_dep3;
425 vassert((oldC & ~1) == 0);
426 ULong cf = oldC ? (argL >= argR) : (argL > argR);
427 return cf;
428 }
429 case ARM64G_CC_OP_LOGIC32:
430 case ARM64G_CC_OP_LOGIC64: {
431 /* (res, unused, unused) */
432 return 0; // C after logic is zero on arm64
433 }
434 //ZZ case ARMG_CC_OP_MUL: {
435 //ZZ /* (res, unused, oldC:oldV) */
436 //ZZ UInt oldC = (cc_dep3 >> 1) & 1;
437 //ZZ vassert((cc_dep3 & ~3) == 0);
438 //ZZ UInt cf = oldC;
439 //ZZ return cf;
440 //ZZ }
441 //ZZ case ARMG_CC_OP_MULL: {
442 //ZZ /* (resLo32, resHi32, oldC:oldV) */
443 //ZZ UInt oldC = (cc_dep3 >> 1) & 1;
444 //ZZ vassert((cc_dep3 & ~3) == 0);
445 //ZZ UInt cf = oldC;
446 //ZZ return cf;
447 //ZZ }
448 default:
449 /* shouldn't really make these calls from generated code */
450 vex_printf("arm64g_calculate_flag_c"
451 "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n",
452 cc_op, cc_dep1, cc_dep2, cc_dep3 );
453 vpanic("arm64g_calculate_flag_c");
454 }
455 }
456
457
458 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
459 /* Calculate the V flag from the supplied thunk components, in the
460 least significant bit of the word. Returned bits 63:1 are zero. */
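/* The expressions below use the usual two's-complement overflow
   identities: for addition, overflow occurred iff the result's sign
   differs from the sign of both operands, i.e. the top bit of
   (res ^ argL) & (res ^ argR); for subtraction, iff the operands'
   signs differ and the result's sign differs from argL's, i.e. the
   top bit of (argL ^ argR) & (argL ^ res).  For example,
   0x7FFFFFFF + 1 = 0x80000000 sets V in the ADD32 case. */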
461 static
462 ULong arm64g_calculate_flag_v ( ULong cc_op, ULong cc_dep1,
463 ULong cc_dep2, ULong cc_dep3 )
464 {
465 switch (cc_op) {
466 case ARM64G_CC_OP_COPY: {
467 /* (nzcv:28x0, unused, unused) */
468 ULong vf = (cc_dep1 >> ARM64G_CC_SHIFT_V) & 1;
469 return vf;
470 }
471 case ARM64G_CC_OP_ADD32: {
472 /* (argL, argR, unused) */
473 UInt argL = (UInt)cc_dep1;
474 UInt argR = (UInt)cc_dep2;
475 UInt res = argL + argR;
476 ULong vf = (ULong)(((res ^ argL) & (res ^ argR)) >> 31);
477 return vf;
478 }
479 case ARM64G_CC_OP_ADD64: {
480 /* (argL, argR, unused) */
481 ULong argL = cc_dep1;
482 ULong argR = cc_dep2;
483 ULong res = argL + argR;
484 ULong vf = ((res ^ argL) & (res ^ argR)) >> 63;
485 return vf;
486 }
487 case ARM64G_CC_OP_SUB32: {
488 /* (argL, argR, unused) */
489 UInt argL = (UInt)cc_dep1;
490 UInt argR = (UInt)cc_dep2;
491 UInt res = argL - argR;
492 ULong vf = (ULong)(((argL ^ argR) & (argL ^ res)) >> 31);
493 return vf;
494 }
495 case ARM64G_CC_OP_SUB64: {
496 /* (argL, argR, unused) */
497 ULong argL = cc_dep1;
498 ULong argR = cc_dep2;
499 ULong res = argL - argR;
500 ULong vf = (((argL ^ argR) & (argL ^ res))) >> 63;
501 return vf;
502 }
503 case ARM64G_CC_OP_ADC32: {
504 /* (argL, argR, oldC) */
505 UInt argL = cc_dep1;
506 UInt argR = cc_dep2;
507 UInt oldC = cc_dep3;
508 vassert((oldC & ~1) == 0);
509 UInt res = argL + argR + oldC;
510 ULong vf = (ULong)(((res ^ argL) & (res ^ argR)) >> 31);
511 return vf;
512 }
513 case ARM64G_CC_OP_ADC64: {
514 /* (argL, argR, oldC) */
515 ULong argL = cc_dep1;
516 ULong argR = cc_dep2;
517 ULong oldC = cc_dep3;
518 vassert((oldC & ~1) == 0);
519 ULong res = argL + argR + oldC;
520 ULong vf = ((res ^ argL) & (res ^ argR)) >> 63;
521 return vf;
522 }
523 case ARM64G_CC_OP_SBC32: {
524 /* (argL, argR, oldC) */
525 UInt argL = cc_dep1;
526 UInt argR = cc_dep2;
527 UInt oldC = cc_dep3;
528 vassert((oldC & ~1) == 0);
529 UInt res = argL - argR - (oldC ^ 1);
530 ULong vf = (ULong)(((argL ^ argR) & (argL ^ res)) >> 31);
531 return vf;
532 }
533 case ARM64G_CC_OP_SBC64: {
534 /* (argL, argR, oldC) */
535 ULong argL = cc_dep1;
536 ULong argR = cc_dep2;
537 ULong oldC = cc_dep3;
538 vassert((oldC & ~1) == 0);
539 ULong res = argL - argR - (oldC ^ 1);
540 ULong vf = ((argL ^ argR) & (argL ^ res)) >> 63;
541 return vf;
542 }
543 case ARM64G_CC_OP_LOGIC32:
544 case ARM64G_CC_OP_LOGIC64: {
545 /* (res, unused, unused) */
546 return 0; // V after logic is zero on arm64
547 }
548 //ZZ case ARMG_CC_OP_MUL: {
549 //ZZ /* (res, unused, oldC:oldV) */
550 //ZZ UInt oldV = (cc_dep3 >> 0) & 1;
551 //ZZ vassert((cc_dep3 & ~3) == 0);
552 //ZZ UInt vf = oldV;
553 //ZZ return vf;
554 //ZZ }
555 //ZZ case ARMG_CC_OP_MULL: {
556 //ZZ /* (resLo32, resHi32, oldC:oldV) */
557 //ZZ UInt oldV = (cc_dep3 >> 0) & 1;
558 //ZZ vassert((cc_dep3 & ~3) == 0);
559 //ZZ UInt vf = oldV;
560 //ZZ return vf;
561 //ZZ }
562 default:
563 /* shouldn't really make these calls from generated code */
564 vex_printf("arm64g_calculate_flag_v"
565 "( op=%llu, dep1=0x%llx, dep2=0x%llx, dep3=0x%llx )\n",
566 cc_op, cc_dep1, cc_dep2, cc_dep3 );
567 vpanic("arm64g_calculate_flag_v");
568 }
569 }
570
571
572 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
573 /* Calculate NZCV from the supplied thunk components, in the positions
574 they appear in the CPSR, viz bits 31:28 for N Z C V respectively.
575 Returned bits 27:0 are zero. */
576 ULong arm64g_calculate_flags_nzcv ( ULong cc_op, ULong cc_dep1,
577 ULong cc_dep2, ULong cc_dep3 )
578 {
579 ULong f;
580 ULong res = 0;
581 f = 1 & arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3);
582 res |= (f << ARM64G_CC_SHIFT_N);
583 f = 1 & arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3);
584 res |= (f << ARM64G_CC_SHIFT_Z);
585 f = 1 & arm64g_calculate_flag_c(cc_op, cc_dep1, cc_dep2, cc_dep3);
586 res |= (f << ARM64G_CC_SHIFT_C);
587 f = 1 & arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3);
588 res |= (f << ARM64G_CC_SHIFT_V);
589 return res;
590 }
591
592 //ZZ
593 //ZZ /* CALLED FROM GENERATED CODE: CLEAN HELPER */
594 //ZZ /* Calculate the QC flag from the arguments, in the lowest bit
595 //ZZ of the word (bit 0). Urr, having this out of line is bizarre.
596 //ZZ Push back inline. */
597 //ZZ UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2,
598 //ZZ UInt resR1, UInt resR2 )
599 //ZZ {
600 //ZZ if (resL1 != resR1 || resL2 != resR2)
601 //ZZ return 1;
602 //ZZ else
603 //ZZ return 0;
604 //ZZ }
605
606 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
607 /* Calculate the specified condition from the thunk components, in the
608 lowest bit of the word (bit 0). Returned bits 63:1 are zero. */
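/* Encoding reminder: ARM64 condition codes come in inverted pairs
   (EQ/NE, CS/CC, ..., AL/NV) with the inverted member having bit 0
   set.  The switch below therefore evaluates the "positive" member of
   each pair and XORs in (cond & 1).  For instance,
   cond_n_op = (ARM64CondNE << 4) | ARM64G_CC_OP_SUB64 evaluates Z for
   the SUB64 thunk and returns zf ^ 1. */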
609 ULong arm64g_calculate_condition ( /* ARM64Condcode << 4 | cc_op */
610 ULong cond_n_op ,
611 ULong cc_dep1,
612 ULong cc_dep2, ULong cc_dep3 )
613 {
614 ULong cond = cond_n_op >> 4;
615 ULong cc_op = cond_n_op & 0xF;
616 ULong inv = cond & 1;
617 ULong nf, zf, vf, cf;
618
619 # if PROFILE_NZCV_FLAGS
620 NOTE_EVAL(cc_op, cond);
621 # endif
622
623 // vex_printf("XXXXXXXX %llx %llx %llx %llx\n",
624 // cond_n_op, cc_dep1, cc_dep2, cc_dep3);
625
626 switch (cond) {
627 case ARM64CondEQ: // Z=1 => z
628 case ARM64CondNE: // Z=0
629 zf = arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3);
630 return inv ^ zf;
631
632 case ARM64CondCS: // C=1 => c
633 case ARM64CondCC: // C=0
634 cf = arm64g_calculate_flag_c(cc_op, cc_dep1, cc_dep2, cc_dep3);
635 return inv ^ cf;
636
637 case ARM64CondMI: // N=1 => n
638 case ARM64CondPL: // N=0
639 nf = arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3);
640 return inv ^ nf;
641
642 case ARM64CondVS: // V=1 => v
643 case ARM64CondVC: // V=0
644 vf = arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3);
645 return inv ^ vf;
646
647 case ARM64CondHI: // C=1 && Z=0 => c & ~z
648 case ARM64CondLS: // C=0 || Z=1
649 cf = arm64g_calculate_flag_c(cc_op, cc_dep1, cc_dep2, cc_dep3);
650 zf = arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3);
651 return inv ^ (1 & (cf & ~zf));
652
653 case ARM64CondGE: // N=V => ~(n^v)
654 case ARM64CondLT: // N!=V
655 nf = arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3);
656 vf = arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3);
657 return inv ^ (1 & ~(nf ^ vf));
658
659 case ARM64CondGT: // Z=0 && N=V => ~z & ~(n^v) => ~(z | (n^v))
660 case ARM64CondLE: // Z=1 || N!=V
661 nf = arm64g_calculate_flag_n(cc_op, cc_dep1, cc_dep2, cc_dep3);
662 vf = arm64g_calculate_flag_v(cc_op, cc_dep1, cc_dep2, cc_dep3);
663 zf = arm64g_calculate_flag_z(cc_op, cc_dep1, cc_dep2, cc_dep3);
664 return inv ^ (1 & ~(zf | (nf ^ vf)));
665
666 case ARM64CondAL: // 1
667 case ARM64CondNV: // 1
668 return 1;
669
670 default:
671 /* shouldn't really make these calls from generated code */
672 vex_printf("arm64g_calculate_condition(ARM64)"
673 "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
674 cond, cc_op, cc_dep1, cc_dep2, cc_dep3 );
675 vpanic("armg_calculate_condition(ARM64)");
676 }
677 }
678
679
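/* The eight CRC helpers below run the LSB-first (bit-reflected) CRC
   algorithm one bit at a time: 0xEDB88320 is the reflected CRC-32
   (IEEE 802.3) polynomial used by CRC32B/H/W/X, and 0x82F63B78 is the
   reflected CRC-32C (Castagnoli) polynomial used by CRC32CB/CH/CW/CX.
   Illustrative self-check (sketch, not part of the build): feeding
   the bytes of "123456789" through arm64g_calc_crc32b, starting from
   an all-ones accumulator and inverting at the end, should give the
   standard CRC-32 check value.

      ULong acc = 0xFFFFFFFFULL;
      for (const UChar* p = (const UChar*)"123456789"; *p; p++)
         acc = arm64g_calc_crc32b(acc, *p);
      // expect (acc ^ 0xFFFFFFFFULL) == 0xCBF43926
*/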
680 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
681 ULong arm64g_calc_crc32b ( ULong acc, ULong bits )
682 {
683 UInt i;
684 ULong crc = (bits & 0xFFULL) ^ acc;
685 for (i = 0; i < 8; i++)
686 crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320ULL : 0);
687 return crc;
688 }
689
690 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
691 ULong arm64g_calc_crc32h ( ULong acc, ULong bits )
692 {
693 UInt i;
694 ULong crc = (bits & 0xFFFFULL) ^ acc;
695 for (i = 0; i < 16; i++)
696 crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320ULL : 0);
697 return crc;
698 }
699
700 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
701 ULong arm64g_calc_crc32w ( ULong acc, ULong bits )
702 {
703 UInt i;
704 ULong crc = (bits & 0xFFFFFFFFULL) ^ acc;
705 for (i = 0; i < 32; i++)
706 crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320ULL : 0);
707 return crc;
708 }
709
710 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
711 ULong arm64g_calc_crc32x ( ULong acc, ULong bits )
712 {
713 UInt i;
714 ULong crc = bits ^ acc;
715 for (i = 0; i < 64; i++)
716 crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320ULL : 0);
717 return crc;
718
719 }
720
721 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
722 ULong arm64g_calc_crc32cb ( ULong acc, ULong bits )
723 {
724 UInt i;
725 ULong crc = (bits & 0xFFULL) ^ acc;
726 for (i = 0; i < 8; i++)
727 crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78ULL : 0);
728 return crc;
729 }
730
731 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
732 ULong arm64g_calc_crc32ch ( ULong acc, ULong bits )
733 {
734 UInt i;
735 ULong crc = (bits & 0xFFFFULL) ^ acc;
736 for (i = 0; i < 16; i++)
737 crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78ULL : 0);
738 return crc;
739 }
740
741 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
742 ULong arm64g_calc_crc32cw ( ULong acc, ULong bits )
743 {
744 UInt i;
745 ULong crc = (bits & 0xFFFFFFFFULL) ^ acc;
746 for (i = 0; i < 32; i++)
747 crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78ULL : 0);
748 return crc;
749 }
750
751 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
752 ULong arm64g_calc_crc32cx ( ULong acc, ULong bits )
753 {
754 UInt i;
755 ULong crc = bits ^ acc;
756 for (i = 0; i < 64; i++)
757 crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78ULL : 0);
758 return crc;
759 }
760
761
762 /* CALLED FROM GENERATED CODE */
763 /* DIRTY HELPER (non-referentially-transparent) */
764 /* Horrible hack. On non-arm64 platforms, return 0. */
765 ULong arm64g_dirtyhelper_MRS_CNTVCT_EL0 ( void )
766 {
767 # if defined(__aarch64__) && !defined(__arm__)
768 ULong w = 0x5555555555555555ULL; /* overwritten */
769 __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(w));
770 return w;
771 # else
772 return 0ULL;
773 # endif
774 }
775
776
777 /* CALLED FROM GENERATED CODE */
778 /* DIRTY HELPER (non-referentially-transparent) */
779 /* Horrible hack. On non-arm64 platforms, return 0. */
780 ULong arm64g_dirtyhelper_MRS_CNTFRQ_EL0 ( void )
781 {
782 # if defined(__aarch64__) && !defined(__arm__)
783 ULong w = 0x5555555555555555ULL; /* overwritten */
784 __asm__ __volatile__("mrs %0, cntfrq_el0" : "=r"(w));
785 return w;
786 # else
787 return 0ULL;
788 # endif
789 }
790
791
792 void arm64g_dirtyhelper_PMULLQ ( /*OUT*/V128* res, ULong arg1, ULong arg2 )
793 {
794 /* This doesn't need to be a dirty helper, except for the fact that
795 a clean helper can't return a 128 bit value. This is a pretty
796 lame implementation of PMULLQ, but at least it doesn't contain any
797 data dependent branches, and has lots of ILP. I guess we could unroll
798 the loop completely and offer extensive prayers to the gods of ILP
799 if more performance is needed. */
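   /* Tiny worked example (carry-less multiply over GF(2)[x]): with
      arg1 = 0b11 and arg2 = 0b11 the loop XORs in arg2 and then
      (arg2 << 1), giving accLo = 0b101 -- that is,
      (x + 1) * (x + 1) = x^2 + 1, with no carries between bits. */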
800 UInt i;
801 ULong accHi = 0, accLo = 0;
802 ULong op2Hi = 0, op2Lo = arg2;
803 for (i = 0; i < 64; i++) {
804 /* Make |mask| be all 0s or all 1s, a copy of arg1[i] */
805 Long mask = arg1 << (63-i);
806 mask >>= 63;
807 accHi ^= (op2Hi & mask);
808 accLo ^= (op2Lo & mask);
809 /* do: op2Hi:op2Lo <<=u 1 */
810 op2Hi <<= 1;
811 op2Hi |= ((op2Lo >> 63) & 1);
812 op2Lo <<= 1;
813 }
814 res->w64[1] = accHi;
815 res->w64[0] = accLo;
816 }
817
818
819 /*---------------------------------------------------------------*/
820 /*--- Crypto instruction helpers ---*/
821 /*---------------------------------------------------------------*/
822
823 /* DIRTY HELPERS for doing AES support:
824 * AESE (SubBytes, then ShiftRows)
825 * AESD (InvShiftRows, then InvSubBytes)
826 * AESMC (MixColumns)
827 * AESIMC (InvMixColumns)
828 These don't actually have to be dirty helpers -- they could be
829 clean, but for the fact that they return a V128 and a clean helper
830 can't do that.
831
832 The ARMv8 manual seems to imply that AESE first performs ShiftRows,
833 then SubBytes. This seems to contradict FIPS 197, so the
834 implementation below is consistent with FIPS 197. One can observe
835 that the two transformations commute -- the order in which they
836 happen makes no difference to the result. So the ambiguity doesn't
837 actually matter, but it is confusing. The v8 manual looks correct
838 about AESD, though.
839
840 The three functions rj_xtime, aesMixColumn and aesInvMixColumn only,
841 are taken from "A byte-oriented AES-256 implementation" and are subject
842 to the following usage terms:
843
844 Byte-oriented AES-256 implementation.
845 All lookup tables replaced with 'on the fly' calculations.
846
847 Copyright (c) 2007-2011 Ilya O. Levin, http://www.literatecode.com
848 Other contributors: Hal Finney
849
850 Permission to use, copy, modify, and distribute this software for any
851 purpose with or without fee is hereby granted, provided that the above
852 copyright notice and this permission notice appear in all copies.
853
854 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
855 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
856 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
857 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
858 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
859 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
860 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
861 */
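/* Why the SubBytes/ShiftRows ordering ambiguity above is harmless:
   ShiftRows merely permutes the 16 state bytes, while SubBytes maps
   each byte through the S-box independently of its position.  So for
   any state s and byte index i,
      SubBytes(ShiftRows(s))[i] = SBox[ s[perm(i)] ]
                                = ShiftRows(SubBytes(s))[i]
   and the two orderings produce identical results. */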
862
863 const UChar aesMapSubBytes[256]
864 = { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
865 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
866 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
867 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
868 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
869 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
870 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
871 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
872 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
873 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
874 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
875 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
876 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
877 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
878 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
879 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
880 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
881 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
882 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
883 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
884 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
885 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
886 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
887 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
888 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
889 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
890 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
891 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
892 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
893 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
894 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
895 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
896 };
897
898 const UChar aesMapInvSubBytes[256]
899 = { 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
900 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
901 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
902 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
903 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
904 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
905 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
906 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
907 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
908 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
909 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
910 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
911 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
912 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
913 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
914 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
915 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
916 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
917 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
918 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
919 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
920 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
921 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
922 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
923 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
924 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
925 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
926 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
927 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
928 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
929 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
930 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
931 };
932
933 static inline UChar rj_xtime ( UChar x )
934 {
935 UChar y = (UChar)(x << 1);
936 return (x & 0x80) ? (y ^ 0x1b) : y;
937 }
938
939 static void aesMixColumn ( /*MOD*/UChar* r )
940 {
941 UChar a = r[0];
942 UChar b = r[1];
943 UChar c = r[2];
944 UChar d = r[3];
945 UChar e = a ^ b ^ c ^ d;
946 r[0] ^= e ^ rj_xtime(a ^ b);
947 r[1] ^= e ^ rj_xtime(b ^ c);
948 r[2] ^= e ^ rj_xtime(c ^ d);
949 r[3] ^= e ^ rj_xtime(d ^ a);
950 }
951
952 static void aesInvMixColumn ( /*MOD*/UChar* r )
953 {
954 UChar a = r[0];
955 UChar b = r[1];
956 UChar c = r[2];
957 UChar d = r[3];
958 UChar e = a ^ b ^ c ^ d;
959 UChar z = rj_xtime(e);
960 UChar x = e ^ rj_xtime(rj_xtime(z ^ a ^ c));
961 UChar y = e ^ rj_xtime(rj_xtime(z ^ b ^ d));
962 r[0] ^= x ^ rj_xtime(a ^ b);
963 r[1] ^= y ^ rj_xtime(b ^ c);
964 r[2] ^= x ^ rj_xtime(c ^ d);
965 r[3] ^= y ^ rj_xtime(d ^ a);
966 }
967
968
969 /* CALLED FROM GENERATED CODE */
970 void arm64g_dirtyhelper_AESE ( /*OUT*/V128* res, ULong argHi, ULong argLo )
971 {
972 res->w64[1] = argHi;
973 res->w64[0] = argLo;
974
975 /* First do SubBytes on the State. */
976 UInt i;
977 for (i = 0; i < 16; i++) {
978 res->w8[i] = aesMapSubBytes[res->w8[i] & 0xFF];
979 }
980
981 /* Then do ShiftRows on the State. */
982 # define XX(_ix) res->w8[_ix]
983 { UChar old1 = XX(1);
984 XX(1) = XX(5); XX(5) = XX(9); XX(9) = XX(13); XX(13) = old1;
985 }
986 { UChar old2 = XX(2); UChar old6 = XX(6);
987 XX(2) = XX(10); XX(6) = XX(14); XX(10) = old2; XX(14) = old6;
988 }
989 { UChar old15 = XX(15);
990 XX(15) = XX(11); XX(11) = XX(7); XX(7) = XX(3); XX(3) = old15;
991 }
992 # undef XX
993 }
994
995
996 /* CALLED FROM GENERATED CODE */
997 void arm64g_dirtyhelper_AESD ( /*OUT*/V128* res, ULong argHi, ULong argLo )
998 {
999 res->w64[1] = argHi;
1000 res->w64[0] = argLo;
1001
1002 /* First do InvShiftRows on the State. */
1003 # define XX(_ix) res->w8[_ix]
1004 { UChar old13 = XX(13);
1005 XX(13) = XX(9); XX(9) = XX(5); XX(5) = XX(1); XX(1) = old13;
1006 }
1007 { UChar old14 = XX(14); UChar old10 = XX(10);
1008 XX(14) = XX(6); XX(10) = XX(2); XX(6) = old14; XX(2) = old10;
1009 }
1010 { UChar old3 = XX(3);
1011 XX(3) = XX(7); XX(7) = XX(11); XX(11) = XX(15); XX(15) = old3;
1012 }
1013 # undef XX
1014
1015 /* Then do InvSubBytes on the State. */
1016 UInt i;
1017 for (i = 0; i < 16; i++) {
1018 res->w8[i] = aesMapInvSubBytes[res->w8[i] & 0xFF];
1019 }
1020 }
1021
1022
1023 /* CALLED FROM GENERATED CODE */
1024 void arm64g_dirtyhelper_AESMC ( /*OUT*/V128* res, ULong argHi, ULong argLo )
1025 {
1026 res->w64[1] = argHi;
1027 res->w64[0] = argLo;
1028 aesMixColumn(&res->w8[0]);
1029 aesMixColumn(&res->w8[4]);
1030 aesMixColumn(&res->w8[8]);
1031 aesMixColumn(&res->w8[12]);
1032 }
1033
1034
1035 /* CALLED FROM GENERATED CODE */
1036 void arm64g_dirtyhelper_AESIMC ( /*OUT*/V128* res, ULong argHi, ULong argLo )
1037 {
1038 res->w64[1] = argHi;
1039 res->w64[0] = argLo;
1040 aesInvMixColumn(&res->w8[0]);
1041 aesInvMixColumn(&res->w8[4]);
1042 aesInvMixColumn(&res->w8[8]);
1043 aesInvMixColumn(&res->w8[12]);
1044 }
1045
1046
1047 /* DIRTY HELPERS for SHA instruction support. As with the AES helpers
1048 above, these are actually pure functions and are only dirty because
1049 clean helpers can't return a V128. */
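/* Naming note: the SHA1C/SHA1P/SHA1M helpers below differ only in
   which SHA-1 round function feeds the update -- Choose for SHA1C,
   Parity for SHA1P, Majority for SHA1M -- matching the ARMv8
   pseudocode for those instructions.  The SHA256* helpers likewise
   follow the SHA-256 round and message-schedule functions. */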
1050
1051 static inline UInt ROL32 ( UInt x, UInt sh ) {
1052 vassert(sh > 0 && sh < 32);
1053 return (x << sh) | (x >> (32 - sh));
1054 }
1055
1056 static inline UInt ROR32 ( UInt x, UInt sh ) {
1057 vassert(sh > 0 && sh < 32);
1058 return (x >> sh) | (x << (32 - sh));
1059 }
1060
1061 static inline UInt SHAchoose ( UInt x, UInt y, UInt z ) {
1062 return ((y ^ z) & x) ^ z;
1063 }
1064
1065 static inline UInt SHAmajority ( UInt x, UInt y, UInt z ) {
1066 return (x & y) | ((x | y) & z);
1067 }
1068
1069 static inline UInt SHAparity ( UInt x, UInt y, UInt z ) {
1070 return x ^ y ^ z;
1071 }
1072
1073 static inline UInt SHAhashSIGMA0 ( UInt x ) {
1074 return ROR32(x, 2) ^ ROR32(x, 13) ^ ROR32(x, 22);
1075 }
1076
1077 static inline UInt SHAhashSIGMA1 ( UInt x ) {
1078 return ROR32(x, 6) ^ ROR32(x, 11) ^ ROR32(x, 25);
1079 }
1080
1081 static void SHA256hash ( /*MOD*/V128* X, /*MOD*/V128* Y, const V128* W )
1082 {
1083 UInt e;
1084 for (e = 0; e <= 3; e++) {
1085 UInt chs = SHAchoose(Y->w32[0], Y->w32[1], Y->w32[2]);
1086 UInt maj = SHAmajority(X->w32[0], X->w32[1], X->w32[2]);
1087 UInt t = Y->w32[3] + SHAhashSIGMA1(Y->w32[0]) + chs + W->w32[e];
1088 X->w32[3] = t + X->w32[3];
1089 Y->w32[3] = t + SHAhashSIGMA0(X->w32[0]) + maj;
1090 UInt ts = Y->w32[3];
1091 Y->w32[3] = Y->w32[2];
1092 Y->w32[2] = Y->w32[1];
1093 Y->w32[1] = Y->w32[0];
1094 Y->w32[0] = X->w32[3];
1095 X->w32[3] = X->w32[2];
1096 X->w32[2] = X->w32[1];
1097 X->w32[1] = X->w32[0];
1098 X->w32[0] = ts;
1099 }
1100 }
1101
1102 /* CALLED FROM GENERATED CODE */
1103 void arm64g_dirtyhelper_SHA1C ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1104 ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1105 {
1106 vassert(nHi == 0);
1107 vassert((nLo >> 32) == 0);
1108 V128 X; X.w64[1] = dHi; X.w64[0] = dLo;
1109 UInt Y; Y = (UInt)nLo;
1110 V128 W; W.w64[1] = mHi; W.w64[0] = mLo;
1111 UInt e;
1112 for (e = 0; e <= 3; e++) {
1113 UInt t = SHAchoose(X.w32[1], X.w32[2], X.w32[3]);
1114 Y = Y + ROL32(X.w32[0], 5) + t + W.w32[e];
1115 X.w32[1] = ROL32(X.w32[1], 30);
1116 UInt oldY = Y;
1117 Y = X.w32[3];
1118 X.w32[3] = X.w32[2];
1119 X.w32[2] = X.w32[1];
1120 X.w32[1] = X.w32[0];
1121 X.w32[0] = oldY;
1122 }
1123 res->w64[1] = X.w64[1];
1124 res->w64[0] = X.w64[0];
1125 }
1126
1127 /* CALLED FROM GENERATED CODE */
1128 void arm64g_dirtyhelper_SHA1H ( /*OUT*/V128* res, ULong nHi, ULong nLo )
1129 {
1130 vassert(nHi == 0);
1131 vassert((nLo >> 32) == 0);
1132 res->w32[3] = res->w32[2] = res->w32[1] = 0;
1133 res->w32[0] = ROL32((UInt)nLo, 30);
1134 }
1135
1136 /* CALLED FROM GENERATED CODE */
1137 void arm64g_dirtyhelper_SHA1M ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1138 ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1139 {
1140 vassert(nHi == 0);
1141 vassert((nLo >> 32) == 0);
1142 V128 X; X.w64[1] = dHi; X.w64[0] = dLo;
1143 UInt Y; Y = (UInt)nLo;
1144 V128 W; W.w64[1] = mHi; W.w64[0] = mLo;
1145 UInt e;
1146 for (e = 0; e <= 3; e++) {
1147 UInt t = SHAmajority(X.w32[1], X.w32[2], X.w32[3]);
1148 Y = Y + ROL32(X.w32[0], 5) + t + W.w32[e];
1149 X.w32[1] = ROL32(X.w32[1], 30);
1150 UInt oldY = Y;
1151 Y = X.w32[3];
1152 X.w32[3] = X.w32[2];
1153 X.w32[2] = X.w32[1];
1154 X.w32[1] = X.w32[0];
1155 X.w32[0] = oldY;
1156 }
1157 res->w64[1] = X.w64[1];
1158 res->w64[0] = X.w64[0];
1159 }
1160
1161 /* CALLED FROM GENERATED CODE */
1162 void arm64g_dirtyhelper_SHA1P ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1163 ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1164 {
1165 vassert(nHi == 0);
1166 vassert((nLo >> 32) == 0);
1167 V128 X; X.w64[1] = dHi; X.w64[0] = dLo;
1168 UInt Y; Y = (UInt)nLo;
1169 V128 W; W.w64[1] = mHi; W.w64[0] = mLo;
1170 UInt e;
1171 for (e = 0; e <= 3; e++) {
1172 UInt t = SHAparity(X.w32[1], X.w32[2], X.w32[3]);
1173 Y = Y + ROL32(X.w32[0], 5) + t + W.w32[e];
1174 X.w32[1] = ROL32(X.w32[1], 30);
1175 UInt oldY = Y;
1176 Y = X.w32[3];
1177 X.w32[3] = X.w32[2];
1178 X.w32[2] = X.w32[1];
1179 X.w32[1] = X.w32[0];
1180 X.w32[0] = oldY;
1181 }
1182 res->w64[1] = X.w64[1];
1183 res->w64[0] = X.w64[0];
1184 }
1185
1186 /* CALLED FROM GENERATED CODE */
1187 void arm64g_dirtyhelper_SHA1SU0 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1188 ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1189 {
1190 res->w64[1] = nLo;
1191 res->w64[0] = dHi;
1192 res->w64[1] ^= dHi ^ mHi;
1193 res->w64[0] ^= dLo ^ mLo;
1194 }
1195
1196 /* CALLED FROM GENERATED CODE */
1197 void arm64g_dirtyhelper_SHA1SU1 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1198 ULong nHi, ULong nLo )
1199 {
1200 /* This computes "T = Vd ^ (Vn >>u 32)" */
1201 V128 T; T.w64[1] = nHi; T.w64[0] = nLo;
1202 T.w32[0] = T.w32[1];
1203 T.w32[1] = T.w32[2];
1204 T.w32[2] = T.w32[3];
1205 T.w32[3] = 0;
1206 T.w64[1] ^= dHi;
1207 T.w64[0] ^= dLo;
1208 /* */
1209 res->w32[0] = ROL32(T.w32[0], 1);
1210 res->w32[1] = ROL32(T.w32[1], 1);
1211 res->w32[2] = ROL32(T.w32[2], 1);
1212 res->w32[3] = ROL32(T.w32[3], 1) ^ ROL32(T.w32[0], 2);
1213 }
1214
1215 /* CALLED FROM GENERATED CODE */
1216 void arm64g_dirtyhelper_SHA256H2 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1217 ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1218 {
1219 V128 X; X.w64[1] = nHi; X.w64[0] = nLo;
1220 V128 Y; Y.w64[1] = dHi; Y.w64[0] = dLo;
1221 V128 W; W.w64[1] = mHi; W.w64[0] = mLo;
1222 SHA256hash(&X, &Y, &W);
1223 res->w64[1] = Y.w64[1];
1224 res->w64[0] = Y.w64[0];
1225 }
1226
1227 /* CALLED FROM GENERATED CODE */
1228 void arm64g_dirtyhelper_SHA256H ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1229 ULong nHi, ULong nLo, ULong mHi, ULong mLo )
1230 {
1231 V128 X; X.w64[1] = dHi; X.w64[0] = dLo;
1232 V128 Y; Y.w64[1] = nHi; Y.w64[0] = nLo;
1233 V128 W; W.w64[1] = mHi; W.w64[0] = mLo;
1234 SHA256hash(&X, &Y, &W);
1235 res->w64[1] = X.w64[1];
1236 res->w64[0] = X.w64[0];
1237 }
1238
1239 /* CALLED FROM GENERATED CODE */
1240 void arm64g_dirtyhelper_SHA256SU0 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1241 ULong nHi, ULong nLo )
1242
1243 {
1244 res->w64[1] = res->w64[0] = 0;
1245 V128 op1; op1.w64[1] = dHi; op1.w64[0] = dLo;
1246 V128 op2; op2.w64[1] = nHi; op2.w64[0] = nLo;
1247 V128 T;
1248 T.w32[3] = op2.w32[0];
1249 T.w32[2] = op1.w32[3];
1250 T.w32[1] = op1.w32[2];
1251 T.w32[0] = op1.w32[1];
1252 UInt e;
1253 for (e = 0; e <= 3; e++) {
1254 UInt elt = T.w32[e];
1255 elt = ROR32(elt, 7) ^ ROR32(elt, 18) ^ (elt >> 3);
1256 res->w32[e] = elt + op1.w32[e];
1257 }
1258 }
1259
1260 /* CALLED FROM GENERATED CODE */
1261 void arm64g_dirtyhelper_SHA256SU1 ( /*OUT*/V128* res, ULong dHi, ULong dLo,
1262 ULong nHi, ULong nLo,
1263 ULong mHi, ULong mLo )
1264 {
1265 res->w64[0] = res->w64[1] = 0;
1266 V128 op1; op1.w64[1] = dHi; op1.w64[0] = dLo;
1267 V128 op2; op2.w64[1] = nHi; op2.w64[0] = nLo;
1268 V128 op3; op3.w64[1] = mHi; op3.w64[0] = mLo;
1269 V128 T0;
1270 T0.w32[3] = op3.w32[0];
1271 T0.w32[2] = op2.w32[3];
1272 T0.w32[1] = op2.w32[2];
1273 T0.w32[0] = op2.w32[1];
1274 UInt T1[2];
1275 UInt e;
1276 T1[1] = op3.w32[3];
1277 T1[0] = op3.w32[2];
1278 for (e = 0; e <= 1; e++) {
1279 UInt elt = T1[e];
1280 elt = ROR32(elt, 17) ^ ROR32(elt, 19) ^ (elt >> 10);
1281 elt = elt + op1.w32[e] + T0.w32[e];
1282 res->w32[e] = elt;
1283 }
1284 T1[1] = res->w32[1];
1285 T1[0] = res->w32[0];
1286 for (e = 2; e <= 3; e++) {
1287 UInt elt = T1[e-2];
1288 elt = ROR32(elt, 17) ^ ROR32(elt, 19) ^ (elt >> 10);
1289 elt = elt + op1.w32[e] + T0.w32[e];
1290 res->w32[e] = elt;
1291 }
1292 }
1293
1294
1295 /*---------------------------------------------------------------*/
1296 /*--- Flag-helpers translation-time function specialisers. ---*/
1297 /*--- These help iropt specialise calls the above run-time ---*/
1298 /*--- flags functions. ---*/
1299 /*---------------------------------------------------------------*/
1300
1301 /* Used by the optimiser to try specialisations. Returns an
1302 equivalent expression, or NULL if none. */
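/* Illustrative example of the kind of rewrite performed here (sketch
   only): a thunk evaluation such as

      arm64g_calculate_condition( (ARM64CondEQ << 4) | ARM64G_CC_OP_SUB64,
                                  argL, argR, unused )

   is replaced by the equivalent, much cheaper IR

      1Uto64( CmpEQ64(argL, argR) )

   which iropt can then fold further, so the flag thunk never needs to
   be materialised at all. */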
1303
1304 static Bool isU64 ( IRExpr* e, ULong n )
1305 {
1306 return
1307 toBool( e->tag == Iex_Const
1308 && e->Iex.Const.con->tag == Ico_U64
1309 && e->Iex.Const.con->Ico.U64 == n );
1310 }
1311
1312 IRExpr* guest_arm64_spechelper ( const HChar* function_name,
1313 IRExpr** args,
1314 IRStmt** precedingStmts,
1315 Int n_precedingStmts )
1316 {
1317 # define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
1318 # define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
1319 # define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
1320 # define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
1321
1322 Int i, arity = 0;
1323 for (i = 0; args[i]; i++)
1324 arity++;
1325 //ZZ # if 0
1326 //ZZ vex_printf("spec request:\n");
1327 //ZZ vex_printf(" %s ", function_name);
1328 //ZZ for (i = 0; i < arity; i++) {
1329 //ZZ vex_printf(" ");
1330 //ZZ ppIRExpr(args[i]);
1331 //ZZ }
1332 //ZZ vex_printf("\n");
1333 //ZZ # endif
1334
1335 /* --------- specialising "arm64g_calculate_condition" --------- */
1336
1337 if (vex_streq(function_name, "arm64g_calculate_condition")) {
1338
1339 /* specialise calls to the "arm64g_calculate_condition" function.
1340 Not sure whether this is strictly necessary, but: the
1341 replacement IR must produce only the values 0 or 1. Bits
1342 63:1 are required to be zero. */
1343 IRExpr *cond_n_op, *cc_dep1, *cc_dep2 ; //, *cc_ndep;
1344 vassert(arity == 4);
1345 cond_n_op = args[0]; /* (ARM64Condcode << 4) | ARM64G_CC_OP_* */
1346 cc_dep1 = args[1];
1347 cc_dep2 = args[2];
1348 //cc_ndep = args[3];
1349
1350 /*---------------- SUB64 ----------------*/
1351
1352 /* 0, 1 */
1353 if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_SUB64)) {
1354 /* EQ after SUB --> test argL == argR */
1355 return unop(Iop_1Uto64,
1356 binop(Iop_CmpEQ64, cc_dep1, cc_dep2));
1357 }
1358 if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_SUB64)) {
1359 /* NE after SUB --> test argL != argR */
1360 return unop(Iop_1Uto64,
1361 binop(Iop_CmpNE64, cc_dep1, cc_dep2));
1362 }
1363
1364 /* 2, 3 */
1365 if (isU64(cond_n_op, (ARM64CondCS << 4) | ARM64G_CC_OP_SUB64)) {
1366 /* CS after SUB --> test argL >=u argR
1367 --> test argR <=u argL */
1368 return unop(Iop_1Uto64,
1369 binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
1370 }
1371 if (isU64(cond_n_op, (ARM64CondCC << 4) | ARM64G_CC_OP_SUB64)) {
1372 /* CC after SUB --> test argL <u argR */
1373 return unop(Iop_1Uto64,
1374 binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
1375 }
1376
1377 /* 8, 9 */
1378 if (isU64(cond_n_op, (ARM64CondLS << 4) | ARM64G_CC_OP_SUB64)) {
1379 /* LS after SUB --> test argL <=u argR */
1380 return unop(Iop_1Uto64,
1381 binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
1382 }
1383 if (isU64(cond_n_op, (ARM64CondHI << 4) | ARM64G_CC_OP_SUB64)) {
1384 /* HI after SUB --> test argL >u argR
1385 --> test argR <u argL */
1386 return unop(Iop_1Uto64,
1387 binop(Iop_CmpLT64U, cc_dep2, cc_dep1));
1388 }
1389
1390 /* 10, 11 */
1391 if (isU64(cond_n_op, (ARM64CondLT << 4) | ARM64G_CC_OP_SUB64)) {
1392 /* LT after SUB --> test argL <s argR */
1393 return unop(Iop_1Uto64,
1394 binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
1395 }
1396 if (isU64(cond_n_op, (ARM64CondGE << 4) | ARM64G_CC_OP_SUB64)) {
1397 /* GE after SUB --> test argL >=s argR
1398 --> test argR <=s argL */
1399 return unop(Iop_1Uto64,
1400 binop(Iop_CmpLE64S, cc_dep2, cc_dep1));
1401 }
1402
1403 /* 12, 13 */
1404 if (isU64(cond_n_op, (ARM64CondGT << 4) | ARM64G_CC_OP_SUB64)) {
1405 /* GT after SUB --> test argL >s argR
1406 --> test argR <s argL */
1407 return unop(Iop_1Uto64,
1408 binop(Iop_CmpLT64S, cc_dep2, cc_dep1));
1409 }
1410 if (isU64(cond_n_op, (ARM64CondLE << 4) | ARM64G_CC_OP_SUB64)) {
1411 /* LE after SUB --> test argL <=s argR */
1412 return unop(Iop_1Uto64,
1413 binop(Iop_CmpLE64S, cc_dep1, cc_dep2));
1414 }
1415
1416 /*---------------- SUB32 ----------------*/
1417
1418 /* 0, 1 */
1419 if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_SUB32)) {
1420 /* EQ after SUB --> test argL == argR */
1421 return unop(Iop_1Uto64,
1422 binop(Iop_CmpEQ32, unop(Iop_64to32, cc_dep1),
1423 unop(Iop_64to32, cc_dep2)));
1424 }
1425 if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_SUB32)) {
1426 /* NE after SUB --> test argL != argR */
1427 return unop(Iop_1Uto64,
1428 binop(Iop_CmpNE32, unop(Iop_64to32, cc_dep1),
1429 unop(Iop_64to32, cc_dep2)));
1430 }
1431
1432 /* 2, 3 */
1433 if (isU64(cond_n_op, (ARM64CondCS << 4) | ARM64G_CC_OP_SUB32)) {
1434 /* CS after SUB --> test argL >=u argR
1435 --> test argR <=u argL */
1436 return unop(Iop_1Uto64,
1437 binop(Iop_CmpLE32U, unop(Iop_64to32, cc_dep2),
1438 unop(Iop_64to32, cc_dep1)));
1439 }
1440 if (isU64(cond_n_op, (ARM64CondCC << 4) | ARM64G_CC_OP_SUB32)) {
1441 /* CC after SUB --> test argL <u argR */
1442 return unop(Iop_1Uto64,
1443 binop(Iop_CmpLT32U, unop(Iop_64to32, cc_dep1),
1444 unop(Iop_64to32, cc_dep2)));
1445 }
1446
1447 /* 8, 9 */
1448 if (isU64(cond_n_op, (ARM64CondLS << 4) | ARM64G_CC_OP_SUB32)) {
1449 /* LS after SUB --> test argL <=u argR */
1450 return unop(Iop_1Uto64,
1451 binop(Iop_CmpLE32U, unop(Iop_64to32, cc_dep1),
1452 unop(Iop_64to32, cc_dep2)));
1453 }
1454 if (isU64(cond_n_op, (ARM64CondHI << 4) | ARM64G_CC_OP_SUB32)) {
1455 /* HI after SUB --> test argL >u argR
1456 --> test argR <u argL */
1457 return unop(Iop_1Uto64,
1458 binop(Iop_CmpLT32U, unop(Iop_64to32, cc_dep2),
1459 unop(Iop_64to32, cc_dep1)));
1460 }
1461
1462 /* 10, 11 */
1463 if (isU64(cond_n_op, (ARM64CondLT << 4) | ARM64G_CC_OP_SUB32)) {
1464 /* LT after SUB --> test argL <s argR */
1465 return unop(Iop_1Uto64,
1466 binop(Iop_CmpLT32S, unop(Iop_64to32, cc_dep1),
1467 unop(Iop_64to32, cc_dep2)));
1468 }
1469 if (isU64(cond_n_op, (ARM64CondGE << 4) | ARM64G_CC_OP_SUB32)) {
1470 /* GE after SUB --> test argL >=s argR
1471 --> test argR <=s argL */
1472 return unop(Iop_1Uto64,
1473 binop(Iop_CmpLE32S, unop(Iop_64to32, cc_dep2),
1474 unop(Iop_64to32, cc_dep1)));
1475 }
1476
1477 /* 12, 13 */
1478 if (isU64(cond_n_op, (ARM64CondGT << 4) | ARM64G_CC_OP_SUB32)) {
1479 /* GT after SUB --> test argL >s argR
1480 --> test argR <s argL */
1481 return unop(Iop_1Uto64,
1482 binop(Iop_CmpLT32S, unop(Iop_64to32, cc_dep2),
1483 unop(Iop_64to32, cc_dep1)));
1484 }
1485 if (isU64(cond_n_op, (ARM64CondLE << 4) | ARM64G_CC_OP_SUB32)) {
1486 /* LE after SUB --> test argL <=s argR */
1487 return unop(Iop_1Uto64,
1488 binop(Iop_CmpLE32S, unop(Iop_64to32, cc_dep1),
1489 unop(Iop_64to32, cc_dep2)));
1490 }
1491
1492 //ZZ /*---------------- SBB ----------------*/
1493 //ZZ
1494 //ZZ if (isU32(cond_n_op, (ARMCondHS << 4) | ARMG_CC_OP_SBB)) {
1495 //ZZ /* This seems to happen a lot in softfloat code, eg __divdf3+140 */
1496 //ZZ /* thunk is: (dep1=argL, dep2=argR, ndep=oldC) */
1497 //ZZ /* HS after SBB (same as C after SBB below)
1498 //ZZ --> oldC ? (argL >=u argR) : (argL >u argR)
1499 //ZZ --> oldC ? (argR <=u argL) : (argR <u argL)
1500 //ZZ */
1501 //ZZ return
1502 //ZZ IRExpr_ITE(
1503 //ZZ binop(Iop_CmpNE32, cc_ndep, mkU32(0)),
1504 //ZZ /* case oldC != 0 */
1505 //ZZ unop(Iop_1Uto32, binop(Iop_CmpLE32U, cc_dep2, cc_dep1)),
1506 //ZZ /* case oldC == 0 */
1507 //ZZ unop(Iop_1Uto32, binop(Iop_CmpLT32U, cc_dep2, cc_dep1))
1508 //ZZ );
1509 //ZZ }
1510 //ZZ
1511 //ZZ /*---------------- LOGIC ----------------*/
1512 //ZZ
1513 //ZZ if (isU32(cond_n_op, (ARMCondEQ << 4) | ARMG_CC_OP_LOGIC)) {
1514 //ZZ /* EQ after LOGIC --> test res == 0 */
1515 //ZZ return unop(Iop_1Uto32,
1516 //ZZ binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
1517 //ZZ }
1518 //ZZ if (isU32(cond_n_op, (ARMCondNE << 4) | ARMG_CC_OP_LOGIC)) {
1519 //ZZ /* NE after LOGIC --> test res != 0 */
1520 //ZZ return unop(Iop_1Uto32,
1521 //ZZ binop(Iop_CmpNE32, cc_dep1, mkU32(0)));
1522 //ZZ }
1523 //ZZ
1524 //ZZ if (isU32(cond_n_op, (ARMCondPL << 4) | ARMG_CC_OP_LOGIC)) {
1525 //ZZ /* PL after LOGIC --> test (res >> 31) == 0 */
1526 //ZZ return unop(Iop_1Uto32,
1527 //ZZ binop(Iop_CmpEQ32,
1528 //ZZ binop(Iop_Shr32, cc_dep1, mkU8(31)),
1529 //ZZ mkU32(0)));
1530 //ZZ }
1531 //ZZ if (isU32(cond_n_op, (ARMCondMI << 4) | ARMG_CC_OP_LOGIC)) {
1532 //ZZ /* MI after LOGIC --> test (res >> 31) == 1 */
1533 //ZZ return unop(Iop_1Uto32,
1534 //ZZ binop(Iop_CmpEQ32,
1535 //ZZ binop(Iop_Shr32, cc_dep1, mkU8(31)),
1536 //ZZ mkU32(1)));
1537 //ZZ }
1538
1539 /*---------------- COPY ----------------*/
1540
1541 if (isU64(cond_n_op, (ARM64CondEQ << 4) | ARM64G_CC_OP_COPY)) {
1542 /* EQ after COPY --> (cc_dep1 >> ARM64G_CC_SHIFT_Z) & 1 */
1543 return binop(Iop_And64,
1544 binop(Iop_Shr64, cc_dep1,
1545 mkU8(ARM64G_CC_SHIFT_Z)),
1546 mkU64(1));
1547 }
1548 if (isU64(cond_n_op, (ARM64CondNE << 4) | ARM64G_CC_OP_COPY)) {
1549 /* NE after COPY --> ((cc_dep1 >> ARM64G_CC_SHIFT_Z) ^ 1) & 1 */
1550 return binop(Iop_And64,
1551 binop(Iop_Xor64,
1552 binop(Iop_Shr64, cc_dep1,
1553 mkU8(ARM64G_CC_SHIFT_Z)),
1554 mkU64(1)),
1555 mkU64(1));
1556 }
1557
1558 //ZZ /*----------------- AL -----------------*/
1559 //ZZ
1560 //ZZ /* A critically important case for Thumb code.
1561 //ZZ
1562 //ZZ What we're trying to spot is the case where cond_n_op is an
1563 //ZZ expression of the form Or32(..., 0xE0) since that means the
1564 //ZZ caller is asking for CondAL and we can simply return 1
1565 //ZZ without caring what the ... part is. This is a potentially
1566 //ZZ dodgy kludge in that it assumes that the ... part has zeroes
1567 //ZZ in bits 7:4, so that the result of the Or32 is guaranteed to
1568 //ZZ be 0xE in bits 7:4. Given that the places where this first
1569 //ZZ arg are constructed (in guest_arm_toIR.c) are very
1570 //ZZ constrained, we can get away with this. To make this
1571 //ZZ guaranteed safe would require to have a new primop, Slice44
1572 //ZZ or some such, thusly
1573 //ZZ
1574 //ZZ Slice44(arg1, arg2) = 0--(24)--0 arg1[7:4] arg2[3:0]
1575 //ZZ
1576 //ZZ and we would then look for Slice44(0xE0, ...)
1577 //ZZ which would give the required safety property.
1578 //ZZ
1579 //ZZ It would be infeasibly expensive to scan backwards through
1580 //ZZ the entire block looking for an assignment to the temp, so
1581 //ZZ just look at the previous 16 statements. That should find it
1582 //ZZ if it is an interesting case, as a result of how the
1583 //ZZ boilerplate guff at the start of each Thumb insn translation
1584 //ZZ is made.
1585 //ZZ */
1586 //ZZ if (cond_n_op->tag == Iex_RdTmp) {
1587 //ZZ Int j;
//ZZ         IRTemp look_for = cond_n_op->Iex.RdTmp.tmp;
//ZZ         Int    limit    = n_precedingStmts - 16;
//ZZ         if (limit < 0) limit = 0;
//ZZ         if (0) vex_printf("scanning %d .. %d\n", n_precedingStmts-1, limit);
//ZZ         for (j = n_precedingStmts - 1; j >= limit; j--) {
//ZZ            IRStmt* st = precedingStmts[j];
//ZZ            if (st->tag == Ist_WrTmp
//ZZ                && st->Ist.WrTmp.tmp == look_for
//ZZ                && st->Ist.WrTmp.data->tag == Iex_Binop
//ZZ                && st->Ist.WrTmp.data->Iex.Binop.op == Iop_Or32
//ZZ                && isU32(st->Ist.WrTmp.data->Iex.Binop.arg2, (ARMCondAL << 4)))
//ZZ               return mkU32(1);
//ZZ         }
//ZZ         /* Didn't find any useful binding to the first arg
//ZZ            in the previous 16 stmts. */
//ZZ      }
   }

//ZZ    /* --------- specialising "armg_calculate_flag_c" --------- */
//ZZ
//ZZ    else
//ZZ    if (vex_streq(function_name, "armg_calculate_flag_c")) {
//ZZ
//ZZ       /* specialise calls to the "armg_calculate_flag_c" function.
//ZZ          Note that the returned value must be either 0 or 1; nonzero
//ZZ          bits 31:1 are not allowed.  In turn, incoming oldV and oldC
//ZZ          values (from the thunk) are assumed to have bits 31:1
//ZZ          clear. */
//ZZ       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
//ZZ       vassert(arity == 4);
//ZZ       cc_op   = args[0]; /* ARMG_CC_OP_* */
//ZZ       cc_dep1 = args[1];
//ZZ       cc_dep2 = args[2];
//ZZ       cc_ndep = args[3];
//ZZ
//ZZ       if (isU32(cc_op, ARMG_CC_OP_LOGIC)) {
//ZZ          /* Thunk args are (result, shco, oldV) */
//ZZ          /* C after LOGIC --> shco */
//ZZ          return cc_dep2;
//ZZ       }
//ZZ
//ZZ       if (isU32(cc_op, ARMG_CC_OP_SUB)) {
//ZZ          /* Thunk args are (argL, argR, unused) */
//ZZ          /* C after SUB --> argL >=u argR
//ZZ                        --> argR <=u argL */
//ZZ          return unop(Iop_1Uto32,
//ZZ                      binop(Iop_CmpLE32U, cc_dep2, cc_dep1));
//ZZ       }
//ZZ
//ZZ       if (isU32(cc_op, ARMG_CC_OP_SBB)) {
//ZZ          /* This happens occasionally in softfloat code, eg __divdf3+140 */
//ZZ          /* thunk is:  (dep1=argL, dep2=argR, ndep=oldC) */
//ZZ          /* C after SBB (same as HS after SBB above)
//ZZ             --> oldC ? (argL >=u argR) : (argL >u argR)
//ZZ             --> oldC ? (argR <=u argL) : (argR <u argL)
//ZZ          */
//ZZ          return
//ZZ             IRExpr_ITE(
//ZZ                binop(Iop_CmpNE32, cc_ndep, mkU32(0)),
//ZZ                /* case oldC != 0 */
//ZZ                unop(Iop_1Uto32, binop(Iop_CmpLE32U, cc_dep2, cc_dep1)),
//ZZ                /* case oldC == 0 */
//ZZ                unop(Iop_1Uto32, binop(Iop_CmpLT32U, cc_dep2, cc_dep1))
//ZZ             );
//ZZ       }
//ZZ
//ZZ    }
//ZZ
//ZZ    /* --------- specialising "armg_calculate_flag_v" --------- */
//ZZ
//ZZ    else
//ZZ    if (vex_streq(function_name, "armg_calculate_flag_v")) {
//ZZ
//ZZ       /* specialise calls to the "armg_calculate_flag_v" function.
//ZZ          Note that the returned value must be either 0 or 1; nonzero
//ZZ          bits 31:1 are not allowed.  In turn, incoming oldV and oldC
//ZZ          values (from the thunk) are assumed to have bits 31:1
//ZZ          clear. */
//ZZ       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
//ZZ       vassert(arity == 4);
//ZZ       cc_op   = args[0]; /* ARMG_CC_OP_* */
//ZZ       cc_dep1 = args[1];
//ZZ       cc_dep2 = args[2];
//ZZ       cc_ndep = args[3];
//ZZ
//ZZ       if (isU32(cc_op, ARMG_CC_OP_LOGIC)) {
//ZZ          /* Thunk args are (result, shco, oldV) */
//ZZ          /* V after LOGIC --> oldV */
//ZZ          return cc_ndep;
//ZZ       }
//ZZ
//ZZ       if (isU32(cc_op, ARMG_CC_OP_SUB)) {
//ZZ          /* Thunk args are (argL, argR, unused) */
//ZZ          /* V after SUB
//ZZ             --> let res = argL - argR
//ZZ                 in ((argL ^ argR) & (argL ^ res)) >> 31
//ZZ             --> ((argL ^ argR) & (argL ^ (argL - argR))) >> 31
//ZZ          */
//ZZ          IRExpr* argL = cc_dep1;
//ZZ          IRExpr* argR = cc_dep2;
//ZZ          return
//ZZ             binop(Iop_Shr32,
//ZZ                   binop(Iop_And32,
//ZZ                         binop(Iop_Xor32, argL, argR),
//ZZ                         binop(Iop_Xor32, argL, binop(Iop_Sub32, argL, argR))
//ZZ                   ),
//ZZ                   mkU8(31)
//ZZ             );
//ZZ       }
//ZZ
//ZZ       if (isU32(cc_op, ARMG_CC_OP_SBB)) {
//ZZ          /* This happens occasionally in softfloat code, eg __divdf3+140 */
//ZZ          /* thunk is:  (dep1=argL, dep2=argR, ndep=oldC) */
//ZZ          /* V after SBB
//ZZ             --> let res = argL - argR - (oldC ^ 1)
//ZZ                 in  (argL ^ argR) & (argL ^ res) & 1
//ZZ          */
//ZZ          return
//ZZ             binop(
//ZZ                Iop_And32,
//ZZ                binop(
//ZZ                   Iop_And32,
//ZZ                   // argL ^ argR
//ZZ                   binop(Iop_Xor32, cc_dep1, cc_dep2),
//ZZ                   // argL ^ (argL - argR - (oldC ^ 1))
//ZZ                   binop(Iop_Xor32,
//ZZ                         cc_dep1,
//ZZ                         binop(Iop_Sub32,
//ZZ                               binop(Iop_Sub32, cc_dep1, cc_dep2),
//ZZ                               binop(Iop_Xor32, cc_ndep, mkU32(1)))
//ZZ                   )
//ZZ                ),
//ZZ                mkU32(1)
//ZZ             );
//ZZ       }
//ZZ
//ZZ    }

#  undef unop
#  undef binop
#  undef mkU64
#  undef mkU8

   return NULL;
}
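
/* A minimal, self-contained sketch (not part of the build, plain C only):
   it checks the two SUB-case recovery rules quoted in the commented-out
   ARM specialiser above, namely that C after SUB is "argL >=u argR" and
   that V after SUB is bit 31 of ((argL ^ argR) & (argL ^ (argL - argR))).
   The helper names below are made up for illustration. */
#if 0
#include <assert.h>
#include <stdint.h>

static uint32_t sub_flag_c ( uint32_t argL, uint32_t argR )
{
   /* C (no borrow) is set exactly when the unsigned subtraction
      does not wrap, i.e. argL >=u argR. */
   return argL >= argR ? 1 : 0;
}

static uint32_t sub_flag_v ( uint32_t argL, uint32_t argR )
{
   /* Signed overflow: argL and argR have different signs and the
      result's sign differs from argL's. */
   uint32_t res = argL - argR;
   return ((argL ^ argR) & (argL ^ res)) >> 31;
}

static void sub_flag_demo ( void )
{
   assert(sub_flag_c(5, 3) == 1);            /* 5 - 3: no borrow       */
   assert(sub_flag_c(3, 5) == 0);            /* 3 - 5: borrows         */
   assert(sub_flag_v(0x80000000u, 1) == 1);  /* INT_MIN - 1 overflows  */
   assert(sub_flag_v(5, 3) == 0);            /* small values: no ovflw */
}
#endif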


/*----------------------------------------------*/
/*--- The exported fns .. ---*/
/*----------------------------------------------*/

//ZZ /* VISIBLE TO LIBVEX CLIENT */
//ZZ #if 0
//ZZ void LibVEX_GuestARM_put_flags ( UInt flags_native,
//ZZ                                  /*OUT*/VexGuestARMState* vex_state )
//ZZ {
//ZZ    vassert(0); // FIXME
//ZZ
//ZZ    /* Mask out everything except N Z V C. */
//ZZ    flags_native
//ZZ       &= (ARMG_CC_MASK_N | ARMG_CC_MASK_Z | ARMG_CC_MASK_V | ARMG_CC_MASK_C);
//ZZ
//ZZ    vex_state->guest_CC_OP   = ARMG_CC_OP_COPY;
//ZZ    vex_state->guest_CC_DEP1 = flags_native;
//ZZ    vex_state->guest_CC_DEP2 = 0;
//ZZ    vex_state->guest_CC_NDEP = 0;
//ZZ }
//ZZ #endif

/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestARM64_get_nzcv ( /*IN*/const VexGuestARM64State* vex_state )
{
   ULong nzcv = 0;
   // NZCV
   nzcv |= arm64g_calculate_flags_nzcv(
              vex_state->guest_CC_OP,
              vex_state->guest_CC_DEP1,
              vex_state->guest_CC_DEP2,
              vex_state->guest_CC_NDEP
           );
   vassert(0 == (nzcv & 0xFFFFFFFF0FFFFFFFULL));
//ZZ    // Q
//ZZ    if (vex_state->guest_QFLAG32 > 0)
//ZZ       cpsr |= (1 << 27);
//ZZ    // GE
//ZZ    if (vex_state->guest_GEFLAG0 > 0)
//ZZ       cpsr |= (1 << 16);
//ZZ    if (vex_state->guest_GEFLAG1 > 0)
//ZZ       cpsr |= (1 << 17);
//ZZ    if (vex_state->guest_GEFLAG2 > 0)
//ZZ       cpsr |= (1 << 18);
//ZZ    if (vex_state->guest_GEFLAG3 > 0)
//ZZ       cpsr |= (1 << 19);
//ZZ    // M
//ZZ    cpsr |= (1 << 4); // 0b10000 means user-mode
//ZZ    // J,T   J (bit 24) is zero by initialisation above
//ZZ    // T  we copy from R15T[0]
//ZZ    if (vex_state->guest_R15T & 1)
//ZZ       cpsr |= (1 << 5);
//ZZ    // ITSTATE we punt on for the time being.  Could compute it
//ZZ    // if needed though.
//ZZ    // E, endianness, 0 (littleendian) from initialisation above
//ZZ    // A,I,F disable some async exceptions.  Not sure about these.
//ZZ    // Leave as zero for the time being.
   return nzcv;
}
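
/* Illustrative sketch (not part of the build): how a LibVEX client might
   split the value returned by LibVEX_GuestARM64_get_nzcv into individual
   flags.  The bit positions assumed here are the ARMv8 PSTATE layout --
   N=31, Z=30, C=29, V=28 -- which is also the only set of bits the
   assertion above allows to be nonzero.  show_nzcv is a made-up name. */
#if 0
static void show_nzcv ( const VexGuestARM64State* st )
{
   ULong nzcv = LibVEX_GuestARM64_get_nzcv(st);
   vex_printf("N=%llu Z=%llu C=%llu V=%llu\n",
              (nzcv >> 31) & 1, (nzcv >> 30) & 1,
              (nzcv >> 29) & 1, (nzcv >> 28) & 1);
}
#endif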

/* VISIBLE TO LIBVEX CLIENT */
ULong LibVEX_GuestARM64_get_fpsr ( const VexGuestARM64State* vex_state )
{
   UInt w32 = vex_state->guest_QCFLAG[0] | vex_state->guest_QCFLAG[1]
              | vex_state->guest_QCFLAG[2] | vex_state->guest_QCFLAG[3];
   ULong fpsr = 0;
   // QC
   if (w32 != 0)
      fpsr |= (1 << 27);
   return fpsr;
}

void LibVEX_GuestARM64_set_fpsr ( /*MOD*/VexGuestARM64State* vex_state,
                                  ULong fpsr )
{
   // QC
   vex_state->guest_QCFLAG[0] = (UInt)((fpsr >> 27) & 1);
   vex_state->guest_QCFLAG[1] = 0;
   vex_state->guest_QCFLAG[2] = 0;
   vex_state->guest_QCFLAG[3] = 0;
}
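
/* Illustrative sketch (not part of the build): the two FPSR accessors
   above only model the cumulative saturation (QC) bit, FPSR[27].  So a
   set/get round trip should preserve exactly bit 27 and nothing else.
   fpsr_qc_roundtrip_demo is a made-up name. */
#if 0
static void fpsr_qc_roundtrip_demo ( void )
{
   VexGuestARM64State st;
   LibVEX_GuestARM64_initialise(&st);

   LibVEX_GuestARM64_set_fpsr(&st, 1ULL << 27);        /* set QC   */
   vassert(LibVEX_GuestARM64_get_fpsr(&st) == (1ULL << 27));

   LibVEX_GuestARM64_set_fpsr(&st, 0);                 /* clear QC */
   vassert(LibVEX_GuestARM64_get_fpsr(&st) == 0);
}
#endif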

/* VISIBLE TO LIBVEX CLIENT */
void LibVEX_GuestARM64_initialise ( /*OUT*/VexGuestARM64State* vex_state )
{
   vex_bzero(vex_state, sizeof(*vex_state));
//ZZ    vex_state->host_EvC_FAILADDR = 0;
//ZZ    vex_state->host_EvC_COUNTER = 0;
//ZZ
//ZZ    vex_state->guest_R0  = 0;
//ZZ    vex_state->guest_R1  = 0;
//ZZ    vex_state->guest_R2  = 0;
//ZZ    vex_state->guest_R3  = 0;
//ZZ    vex_state->guest_R4  = 0;
//ZZ    vex_state->guest_R5  = 0;
//ZZ    vex_state->guest_R6  = 0;
//ZZ    vex_state->guest_R7  = 0;
//ZZ    vex_state->guest_R8  = 0;
//ZZ    vex_state->guest_R9  = 0;
//ZZ    vex_state->guest_R10 = 0;
//ZZ    vex_state->guest_R11 = 0;
//ZZ    vex_state->guest_R12 = 0;
//ZZ    vex_state->guest_R13 = 0;
//ZZ    vex_state->guest_R14 = 0;
//ZZ    vex_state->guest_R15T = 0;  /* NB: implies ARM mode */
//ZZ
   vex_state->guest_CC_OP = ARM64G_CC_OP_COPY;
//ZZ    vex_state->guest_CC_DEP1 = 0;
//ZZ    vex_state->guest_CC_DEP2 = 0;
//ZZ    vex_state->guest_CC_NDEP = 0;
//ZZ    vex_state->guest_QFLAG32 = 0;
//ZZ    vex_state->guest_GEFLAG0 = 0;
//ZZ    vex_state->guest_GEFLAG1 = 0;
//ZZ    vex_state->guest_GEFLAG2 = 0;
//ZZ    vex_state->guest_GEFLAG3 = 0;
//ZZ
//ZZ    vex_state->guest_EMNOTE  = EmNote_NONE;
//ZZ    vex_state->guest_CMSTART = 0;
//ZZ    vex_state->guest_CMLEN   = 0;
//ZZ    vex_state->guest_NRADDR  = 0;
//ZZ    vex_state->guest_IP_AT_SYSCALL = 0;
//ZZ
//ZZ    vex_state->guest_D0  = 0;
//ZZ    vex_state->guest_D1  = 0;
//ZZ    vex_state->guest_D2  = 0;
//ZZ    vex_state->guest_D3  = 0;
//ZZ    vex_state->guest_D4  = 0;
//ZZ    vex_state->guest_D5  = 0;
//ZZ    vex_state->guest_D6  = 0;
//ZZ    vex_state->guest_D7  = 0;
//ZZ    vex_state->guest_D8  = 0;
//ZZ    vex_state->guest_D9  = 0;
//ZZ    vex_state->guest_D10 = 0;
//ZZ    vex_state->guest_D11 = 0;
//ZZ    vex_state->guest_D12 = 0;
//ZZ    vex_state->guest_D13 = 0;
//ZZ    vex_state->guest_D14 = 0;
//ZZ    vex_state->guest_D15 = 0;
//ZZ    vex_state->guest_D16 = 0;
//ZZ    vex_state->guest_D17 = 0;
//ZZ    vex_state->guest_D18 = 0;
//ZZ    vex_state->guest_D19 = 0;
//ZZ    vex_state->guest_D20 = 0;
//ZZ    vex_state->guest_D21 = 0;
//ZZ    vex_state->guest_D22 = 0;
//ZZ    vex_state->guest_D23 = 0;
//ZZ    vex_state->guest_D24 = 0;
//ZZ    vex_state->guest_D25 = 0;
//ZZ    vex_state->guest_D26 = 0;
//ZZ    vex_state->guest_D27 = 0;
//ZZ    vex_state->guest_D28 = 0;
//ZZ    vex_state->guest_D29 = 0;
//ZZ    vex_state->guest_D30 = 0;
//ZZ    vex_state->guest_D31 = 0;
//ZZ
//ZZ    /* ARM encoded; zero is the default as it happens (result flags
//ZZ       (NZCV) cleared, FZ disabled, round to nearest, non-vector mode,
//ZZ       all exns masked, all exn sticky bits cleared). */
//ZZ    vex_state->guest_FPSCR = 0;
//ZZ
//ZZ    vex_state->guest_TPIDRURO = 0;
//ZZ
//ZZ    /* Not in a Thumb IT block. */
//ZZ    vex_state->guest_ITSTATE = 0;
//ZZ
//ZZ    vex_state->padding1 = 0;
//ZZ    vex_state->padding2 = 0;
//ZZ    vex_state->padding3 = 0;
//ZZ    vex_state->padding4 = 0;
//ZZ    vex_state->padding5 = 0;
}
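
/* Illustrative sketch (not part of the build): the minimal sequence a
   LibVEX client might use to set up a fresh guest state -- zero it with
   the initialiser above, then plant at least a program counter and a
   stack pointer.  The addresses below are made-up example values. */
#if 0
static void setup_guest_state_demo ( void )
{
   VexGuestARM64State st;
   LibVEX_GuestARM64_initialise(&st);   /* all zero, CC_OP = ARM64G_CC_OP_COPY */
   st.guest_PC  = 0x400000ULL;          /* hypothetical entry point */
   st.guest_XSP = 0x7FFF0000ULL;        /* hypothetical stack top   */
}
#endif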


/*-----------------------------------------------------------*/
/*--- Describing the arm guest state, for the benefit ---*/
/*--- of iropt and instrumenters. ---*/
/*-----------------------------------------------------------*/

/* Figure out if any part of the guest state contained in minoff
   .. maxoff requires precise memory exceptions.  If in doubt return
   True (but this generates significantly slower code).

   We enforce precise exns for guest SP, PC, 29(FP), 30(LR).
   That might be overkill (for 29 and 30); I don't know.
*/
Bool guest_arm64_state_requires_precise_mem_exns (
        Int minoff, Int maxoff, VexRegisterUpdates pxControl
     )
{
   Int xsp_min = offsetof(VexGuestARM64State, guest_XSP);
   Int xsp_max = xsp_min + 8 - 1;
   Int pc_min  = offsetof(VexGuestARM64State, guest_PC);
   Int pc_max  = pc_min + 8 - 1;

   if (maxoff < xsp_min || minoff > xsp_max) {
      /* no overlap with xsp */
      if (pxControl == VexRegUpdSpAtMemAccess)
         return False; // We only need to check stack pointer.
   } else {
      return True;
   }

   if (maxoff < pc_min || minoff > pc_max) {
      /* no overlap with pc */
   } else {
      return True;
   }

   /* Guessing that we need PX for FP, but I don't really know. */
   Int x29_min = offsetof(VexGuestARM64State, guest_X29);
   Int x29_max = x29_min + 8 - 1;

   if (maxoff < x29_min || minoff > x29_max) {
      /* no overlap with x29 */
   } else {
      return True;
   }

   /* Guessing that we need PX for LR, but I don't really know. */
   Int x30_min = offsetof(VexGuestARM64State, guest_X30);
   Int x30_max = x30_min + 8 - 1;

   if (maxoff < x30_min || minoff > x30_max) {
      /* no overlap with x30 */
   } else {
      return True;
   }

   return False;
}
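
/* Illustrative sketch (not part of the build): the function above is
   queried with inclusive byte ranges of the guest state.  A range that
   touches guest_XSP must report True, whereas a range covering only a
   scratch register such as guest_X5 overlaps none of SP/PC/X29/X30 and
   so reports False.  guest_X5 is just an arbitrary example field and
   precise_exns_demo is a made-up name. */
#if 0
static void precise_exns_demo ( void )
{
   Int sp_lo = offsetof(VexGuestARM64State, guest_XSP);
   Int x5_lo = offsetof(VexGuestARM64State, guest_X5);

   /* Touching the stack pointer always needs precise exceptions. */
   vassert( guest_arm64_state_requires_precise_mem_exns(
               sp_lo, sp_lo + 7, VexRegUpdSpAtMemAccess) );

   /* A plain integer register does not. */
   vassert( !guest_arm64_state_requires_precise_mem_exns(
               x5_lo, x5_lo + 7, VexRegUpdSpAtMemAccess) );
}
#endif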


#define ALWAYSDEFD(field)                            \
    { offsetof(VexGuestARM64State, field),           \
      (sizeof ((VexGuestARM64State*)0)->field) }

VexGuestLayout
   arm64Guest_layout
      = {
          /* Total size of the guest state, in bytes. */
          .total_sizeB = sizeof(VexGuestARM64State),

          /* Describe the stack pointer. */
          .offset_SP = offsetof(VexGuestARM64State,guest_XSP),
          .sizeof_SP = 8,

          /* Describe the instruction pointer. */
          .offset_IP = offsetof(VexGuestARM64State,guest_PC),
          .sizeof_IP = 8,

          /* Describe any sections to be regarded by Memcheck as
             'always-defined'. */
          .n_alwaysDefd = 9,

          /* flags thunk: OP is always defd, whereas DEP1 and DEP2
             have to be tracked.  See detailed comment in gdefs.h on
             meaning of thunk fields. */
          .alwaysDefd
             = { /* 0 */ ALWAYSDEFD(guest_PC),
                 /* 1 */ ALWAYSDEFD(guest_CC_OP),
                 /* 2 */ ALWAYSDEFD(guest_CC_NDEP),
                 /* 3 */ ALWAYSDEFD(guest_EMNOTE),
                 /* 4 */ ALWAYSDEFD(guest_CMSTART),
                 /* 5 */ ALWAYSDEFD(guest_CMLEN),
                 /* 6 */ ALWAYSDEFD(guest_NRADDR),
                 /* 7 */ ALWAYSDEFD(guest_IP_AT_SYSCALL),
                 /* 8 */ ALWAYSDEFD(guest_TPIDR_EL0)
               }
        };
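
/* Illustrative sketch (not part of the build): ALWAYSDEFD(field) expands
   to an { offset-of-field, size-of-field } initialiser pair, so each
   alwaysDefd[] entry above names one guest-state slice that a tool such
   as Memcheck may treat as always defined.  Written out longhand for the
   guest_CC_OP entry, the pair is computed as below; the function name is
   made up. */
#if 0
static void alwaysdefd_expansion_demo ( void )
{
   Int cc_op_offset = offsetof(VexGuestARM64State, guest_CC_OP);
   Int cc_op_size   = sizeof(((VexGuestARM64State*)0)->guest_CC_OP);
   vex_printf("guest_CC_OP slice: offset %d, size %d bytes\n",
              cc_op_offset, cc_op_size);
}
#endif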


/*---------------------------------------------------------------*/
/*--- end guest_arm64_helpers.c ---*/
/*---------------------------------------------------------------*/
