/*  Created by Petr Svenda http://www.svenda.com/petr

    Redistribution and use in source and binary forms, with or without modification,
    are permitted provided that the following conditions are met:
    1. Redistributions of source code must retain the above copyright notice,
       this list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright notice,
       this list of conditions and the following disclaimer in the documentation
       and/or other materials provided with the distribution.
    3. The name of the author may not be used to endorse or promote products
       derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
    WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
    IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
    OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
    ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

    Based on the example non-optimized code for Rijndael by J. Daemen and V. Rijmen

    USAGE:
      // allocate engine
      JavaCardAES aesCipher = new JavaCardAES();
      // set array with initialization vector
      aesCipher.m_IV = array_with_IV;
      aesCipher.m_IVOffset = 0;
      // schedule keys for first key into array array_for_round_keys_1
      aesCipher.RoundKeysSchedule(array_with_key1, (short) 0, array_for_round_keys_1);
      // encrypt block with first key
      aesCipher.AESEncryptBlock(data_to_encrypt, start_offset_of_data, array_for_round_keys_1);
      // schedule keys for second key into array array_for_round_keys_2
      aesCipher.RoundKeysSchedule(array_with_key_2, (short) 0, array_for_round_keys_2);
      // decrypt block with second key
      aesCipher.AESDecryptBlock(data_to_decrypt_2, start_offset_of_data, array_for_round_keys_2);
      (a sketch of CBC-style chaining over multiple blocks is given below, after the status-word constants)

    APPLIED OPTIMIZATIONS:
      - UNROLLED LOOPS (only minor effect, as the compiler is doing that as well)
      - PRE-COMPUTED Alogtable and Logtable (common)
      - PRE-COMPUTED Alogtable_mul2 and Alogtable_mul3 (will speed up the MixColumn computation
        with the 'mul((byte) 2, a[(short) (i + hlp)])' and 'mul((byte) 3, a[(short) (i + hlp)])' commands)
        * due to space constraints, InvMixColumn is NOT optimized this way
          (separate tables for 0xe, 0xb, 0xd, 0x9 would be needed)
        * note: on Cyberflex 32K e-gate the time saving is only 1 second (from 9 sec to 8 sec)
          and the tables need 512 B
        * if they are to be used, uncomment the parts marked ALOG_MUL

    SPEED (Cyberflex 32k e-gate):
      - encryption (one block) in 9 seconds (when MixColumn is "removed", only 4 sec
        => so you may try to optimize MixColumn)
      - key schedule 4 seconds
      - reduced version with 7 rounds only - 6 seconds (!! see note located above N_ROUNDS)

    SPEED (GXP E64PK):
      - encryption (one block) less than 1 second
/**/

package JCAES;

import javacard.framework.*;

public class JavaCardAES {

    final static short SW_IV_BAD                 = (short) 0x6709; // BAD INITIALIZATION VECTOR
    final static short SW_CIPHER_DATA_LENGTH_BAD = (short) 0x6710; // BAD LENGTH OF DATA USED DURING CIPHER OPERATION
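
    /*  ILLUSTRATIVE SKETCH (not part of the original API): CBC-style chaining of the one-block
        AESEncryptBlock() call using m_IV/m_IVOffset, as referenced in USAGE above. The names
        aesCipher, roundKeys, buffer and numBlocks are hypothetical placeholders; buffer is
        assumed to hold numBlocks * BLOCKLEN bytes of plaintext starting at offset 0.

          for (short block = 0; block < numBlocks; block++) {
              short off = (short) (block * JavaCardAES.BLOCKLEN);
              for (short i = 0; i < JavaCardAES.BLOCKLEN; i++) {
                  // XOR the IV (first block) or the previous ciphertext block into the plaintext
                  buffer[(short) (off + i)] ^= (block == 0)
                          ? aesCipher.m_IV[(short) (aesCipher.m_IVOffset + i)]
                          : buffer[(short) (off - JavaCardAES.BLOCKLEN + i)];
              }
              // encrypt the XORed block in place
              aesCipher.AESEncryptBlock(buffer, off, roundKeys);
          }

        Decryption chaining differs: the previous ciphertext block must be saved and XORed into
        the output after AESDecryptBlock().
    /**/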
    // NOTE: BLOCKN & KEYN CONSTANTS ARE DEFINED
    // ONLY FOR BETTER READABILITY OF CODE AND CANNOT BE CHANGED!!!
    final public static byte BLOCKLEN = (byte) (128 / 8);
    final static byte BLOCKN = (byte) (128 / 32);
    final static byte KEYN = (byte) (128 / 32);
    final static short STATELEN = (short) (4 * BLOCKN);

    // IMPORTANT: THIS IMPLEMENTATION IS CONSTRUCTED FOR 128bit KEY AND 128bit BLOCK
    // FOR THIS SETTING, 10 ITERATION ROUNDS ARE GIVEN IN THE SPECIFICATION
    // HOWEVER, THE NUMBER OF ROUNDS CAN BE DECREASED - CURRENTLY (2006) THE BEST KNOWN PRACTICALLY REALISABLE ATTACK
    // IS AGAINST THE REDUCED ALG. WITH 6 ROUNDS AND REQUIRES 2^32 CHOSEN PLAINTEXTS AND 2^44 TIME STEPS (http://www.schneier.com/paper-rijndael.pdf)
    // THEREFORE 7 ROUNDS CANNOT BE ATTACKED RIGHT NOW (2006) AND IF YOU *KNOW WHAT YOU ARE DOING*,
    // THEN REDUCE THE ROUNDS AND GET A 30% SPEED-UP
    // NOTE THAT THE ALGORITHM WILL NOT BE BINARY COMPATIBLE WITH AES TEST VECTORS ANYMORE
    public static byte N_ROUNDS = (byte) 10;

    final static byte rcon[] = {(byte) 0x01, (byte) 0x02, (byte) 0x04, (byte) 0x08, (byte) 0x10,
                                (byte) 0x20, (byte) 0x40, (byte) 0x80, (byte) 0x1b, (byte) 0x36};
    // shifts[0..3] -> ENCRYPT, shifts[4..7] ... DECRYPT
    final static byte shifts[] = {0, 1, 2, 3, 0, 3, 2, 1};

    // NOTE: NEXT ARRAYS COULD BE DECLARED STATIC, BUT AN UNKNOWN PROBLEM OCCURS
    // DURING APPLET INSTALLATION ON Gemplus GXPPro-R3.
    private byte SBox[] = null;
    private byte SiBox[] = null;
    private byte Alogtable[] = null;
// ALOG_MUL    private byte Alogtable_mul2[] = null;
// ALOG_MUL    private byte Alogtable_mul3[] = null;
    private short Logtable[] = null;

    // SCHEDULED ROUND KEYS
    //private byte roundKeys[] = null;
    // PREALLOCATED REUSED TRANSIENT BUFFER
    private byte tempBuffer[] = null;
    // INITIALIZATION VECTOR
    public byte m_IV[] = null;
    public short m_IVOffset = 0;

    public JavaCardAES() {
        // ALLOCATE AND COMPUTE LOOKUP TABLES
        SBox = new byte[256];
        SiBox = new byte[256];
        Alogtable = new byte[256];
// ALOG_MUL        Alogtable_mul2 = new byte[256];
// ALOG_MUL        Alogtable_mul3 = new byte[256];
// ALOG_MUL        Alogtable_mul2 = JCSystem.makeTransientByteArray((short) 256, JCSystem.CLEAR_ON_RESET);
// ALOG_MUL        Alogtable_mul3 = JCSystem.makeTransientByteArray((short) 256, JCSystem.CLEAR_ON_RESET);
        Logtable = new short[256];
        tempBuffer = JCSystem.makeTransientByteArray(STATELEN, JCSystem.CLEAR_ON_RESET);

        MakeSBox();
    }

    // CALCULATION OF LOOKUP TABLES FOR REDUCING CODE SIZE
    private void MakeSBox() {
        byte p = 1;
        short q;
        short i;

        // Alogtable AND Logtable TABLES
        for (i = 0; i < 256; ++i) {
            Alogtable[i] = (byte) p;
            Logtable[(p >= 0) ? p : (short) (256 + p)] = (byte) i;
            p = (byte) (p ^ (p << 1) ^ (((p & 0x80) == 0) ? 0 : 0x01b));
        }
        // CORRECTION OF GENERATED LOG TABLE IS NEEDED
        Logtable[1] = 0;

        // SBox AND SiBox TABLES
        for (i = 0; i < 256; ++i) {
            p = ((i == 0) ? 0 : (Alogtable[(short) (255 - ((Logtable[i] >= 0) ? Logtable[i] : (short) (256 + Logtable[i])))]));
            q = (p >= 0) ? p : (short) (256 + p);
            q = (short) ((q >> 7) | (q << 1)); p ^= (byte) q;
            q = (short) ((q >> 7) | (q << 1)); p ^= (byte) q;
            q = (short) ((q >> 7) | (q << 1)); p ^= (byte) q;
            q = (short) ((q >> 7) | (q << 1)); p ^= (byte) q;
            p = (byte) (p ^ 0x63);
            SBox[i] = (byte) p;
            SiBox[(p >= 0) ? p : (short) (256 + p)] = (byte) i;
        }
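
        // SANITY-CHECK NOTE (added for illustration; values taken from FIPS-197):
        // after the loops above, the generated tables are expected to contain e.g.
        //   Alogtable[0] == (byte) 0x01, Alogtable[1] == (byte) 0x03,
        //   SBox[0x00]   == (byte) 0x63, SBox[0x53]   == (byte) 0xED,
        //   SiBox[0x63]  == (byte) 0x00
        // which can be verified on-card during debugging if the table generation is in doubt.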

        // CONVERT Logtable FROM byte-oriented VALUES INTO short-oriented VALUES
        for (i = 0; i < 256; ++i) {
            if (Logtable[i] < 0) Logtable[i] = (short) (256 + Logtable[i]);
        }

/*// ALOG_MUL
        // PRE-COMPUTE Alogtable_mul2 AND Alogtable_mul3
        Alogtable_mul2[0] = 0;
        for (i = 1; i < 256; i++) Alogtable_mul2[i] = (byte) Alogtable[(short) ((short) (Logtable[2] + Logtable[i]) % 255)];
        Alogtable_mul3[0] = 0;
        for (i = 1; i < 256; i++) Alogtable_mul3[i] = (byte) Alogtable[(short) ((short) (Logtable[3] + Logtable[i]) % 255)];
/**/
    }

    /**
     * Schedule AES round keys for the given key material
     * @param key ... key array
     * @param keyOffset ... start offset in key array
     * @param aesRoundKeys ... array to hold scheduled keys
     */
    public void RoundKeysSchedule(byte key[], short keyOffset, byte aesRoundKeys[]) {
        byte i;
        byte j;
        byte round;
        byte rconpointer = 0;
        short sourceOffset = 0;
        short targetOffset = 0;
        // hlp CONTAINS PRECALCULATED EXPRESSION (round * (4 * KEYN))
        short hlp = 0;

        // FIRST KEY (SAME AS INPUT KEY)
        Util.arrayCopyNonAtomic(key, keyOffset, aesRoundKeys, (short) 0, STATELEN);

        // ROUND KEYS FOR THE N_ROUNDS ROUNDS
        for (round = 1; round <= N_ROUNDS; round++) {
            // TIME REDUCING PRECALCULATION
            hlp += STATELEN;

            // COPY KEY FOR round - 1 TO BUFFER FOR round
            Util.arrayCopyNonAtomic(aesRoundKeys, (short) ((round - 1) * STATELEN), aesRoundKeys, hlp, STATELEN);

            rconpointer = (byte) (round - 1);
            for (i = 0; i < 4; i++) {
                sourceOffset = (short) (((i + 1) % 4) + ((KEYN - 1) * 4) + hlp);
                targetOffset = (short) (i + (0 * 4) + hlp);
                aesRoundKeys[targetOffset] ^= SBox[(aesRoundKeys[sourceOffset] >= 0) ?
                        aesRoundKeys[sourceOffset] : (short) (256 + aesRoundKeys[sourceOffset])];
            }
            aesRoundKeys[hlp] ^= rcon[rconpointer];

            for (j = 1; j < KEYN; j++) {
                for (i = 0; i < 4; i++) {
                    sourceOffset = (short) (i + ((j - 1) * 4) + hlp);
                    targetOffset = (short) ((i + (j * 4)) + hlp);
                    aesRoundKeys[targetOffset] ^= aesRoundKeys[sourceOffset];
                }
            }
        }
    }

/*
//
// NOT USED IN THIS IMPLEMENTATION EXCEPT UNOPTIMIZED VERSION
//
    private static short TAUB(byte a) {
        // RETURN SHORT VALUE CONSTRUCTED FROM SIGNED REPRESENTATION OF UNSIGNED VALUE
        // EXAMPLE: byte val = (byte) 250;   // val == -6
        //          ASSERT(TAUB(val) == 250);
        return ((a >= 0) ? a : (short) (256 + a));
    }

    // MULTIPLY TWO ELEMENTS OF GF(2^m)
    private byte mul(short a, short b) {
        if ((a != 0) && (b != 0)) {
            return (byte) Alogtable[(short) ((short) (Logtable[a] + Logtable[b]) % 255)];
        }
        else return (byte) 0;
    }

    // ADD ROUND KEY USING XOR
    private static void KeyAddition(byte a[], short dataOffset, byte rk[], short keyOffset) {
        byte i;
        for (i = 0; i < STATELEN; i++) a[(short) (i + dataOffset)] ^= rk[(short) (i + keyOffset)];
    }

    // SBox OR SiBox SUBSTITUTION
    private static void Substitution(byte a[], short dataOffset, byte box[]) {
        byte i;
        for (i = 0; i < STATELEN; i++)
            a[(short) (i + dataOffset)] = box[((a[(short) (i + dataOffset)] >= 0) ?
                    a[(short) (i + dataOffset)] : (short) (256 + a[(short) (i + dataOffset)]))];
    }
/**/
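
    // NOTE ON GF(2^8) MULTIPLICATION (illustrative, added for clarity): the commented-out mul()
    // above and the expressions expanded inline in MixColumn/InvMixColumn below all rely on the
    // log/antilog identity  mul(a, b) = Alogtable[(Logtable[a] + Logtable[b]) % 255]  for non-zero
    // a, b. A worked example with the standard FIPS-197 test value 0x57:
    //   mul(0x02, 0x57) == 0xAE   (xtime of 0x57; no reduction, since the high bit of 0x57 is 0)
    //   mul(0x03, 0x57) == 0xF9   (0xAE ^ 0x57, because 0x03 = 0x02 ^ 0x01)
    // These values can serve as a quick self-test of the generated Alogtable/Logtable pair.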

    // SHIFTING ROWS
    private void ShiftRow(byte a[], short dataOffset, byte d) {
        byte i, j;
        // d == 0 SELECTS THE ENCRYPTION SHIFT OFFSETS (shifts[0..3]), d == 1 THE DECRYPTION OFFSETS (shifts[4..7])
        // ALSO THE FIRST ROW IS "SHIFTED" (BUT BY 0 POSITIONS) DUE TO THE POSSIBILITY OF USING Util.arrayCopy() LATER
        // tempBuffer WILL CONTAIN THE SHIFTED STATE a
        for (i = 0; i < 4; i++) {
            for (j = 0; j < BLOCKN; j++)
                tempBuffer[(short) (i + j * 4)] = a[(short) (((i + (byte) ((j + shifts[(short) (i + d * 4)] % BLOCKN) * 4)) % STATELEN) + dataOffset)];
        }
        Util.arrayCopyNonAtomic(tempBuffer, (short) 0, a, dataOffset, STATELEN);
    }

    // MIXING COLUMNS
    // (each state column is multiplied by the fixed AES polynomial {03}x^3 + {01}x^2 + {01}x + {02} over GF(2^8))
    private void MixColumn(byte a[], short dataOffset) {
        byte i = 0, j = 0;
        // hlp CONTAINS PRECALCULATED EXPRESSION ((j * 4) + dataOffset)
        short hlp = dataOffset;
        // hlp2 CONTAINS PRECALCULATED EXPRESSION (j * 4)
        byte hlp2 = -4;
        byte hlp3 = 0;
        short tempVal = 0;
        short tempVal2 = 0;
        short a0 = 0;
        short a1 = 0;
        short a2 = 0;
        short a3 = 0;

        hlp -= 4;
        for (j = 0; j < BLOCKN; j++) {
            // TIME REDUCING PRECALCULATION
            hlp += 4;
            hlp2 += 4;

/*          // UNROLL THIS LOOP:
            // for (i = 0; i < 4; i++) {
            // NOT OPTIMISED
            tempBuffer[(byte) (i + hlp2)] = (byte) mul((short) 2, TAUB(a[(short) (i + hlp)]));
            tempBuffer[(byte) (i + hlp2)] ^= (byte) mul((short) 3, TAUB(a[(short) (((i + 1) % 4) + hlp)]));
            tempBuffer[(byte) (i + hlp2)] ^= (byte) a[(short) (((i + 2) % 4) + hlp)];
            tempBuffer[(byte) (i + hlp2)] ^= (byte) a[(short) (((i + 3) % 4) + hlp)];
            // END NOT OPTIMISED
            }
/**/

            // *** OPT2: OPTIMIZED VERSION WITH UNROLLED LOOPS AND EXPANDED mul() FUNCTION
            // UNROLLED LOOP: for (i = 0; i < 4; i++)
            // ax WILL CONTAIN THE VALUE OF 'a[(short) (((i + x) % 4) + hlp)];' TRANSFORMED FROM byte TO short (via TAUB-like function)
            a0 = a[hlp];               a0 = (a0 >= 0) ? a0 : (short) (256 + a0);
            a1 = a[(short) (1 + hlp)]; a1 = (a1 >= 0) ? a1 : (short) (256 + a1);
            a2 = a[(short) (2 + hlp)]; a2 = (a2 >= 0) ? a2 : (short) (256 + a2);
            a3 = a[(short) (3 + hlp)]; a3 = (a3 >= 0) ? a3 : (short) (256 + a3);

            // i == 0
            // tempBuffer[hlp2] = (byte) mul((byte) 2, a0);
            tempBuffer[hlp2] = (a0 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[2] + Logtable[a0]) % 255)] : (byte) 0;
            // tempBuffer[hlp2] ^= (byte) mul((byte) 3, a1);
            if (a1 != 0) tempBuffer[hlp2] ^= (byte) Alogtable[(short) ((short) (Logtable[3] + Logtable[a1]) % 255)];
            tempBuffer[hlp2] ^= a2;
            tempBuffer[hlp2] ^= a3;

            // i == 1
            hlp3 = (byte) (hlp2 + 1);
            //tempBuffer[hlp3] = (byte) mul((byte) 2, a1);
            tempBuffer[hlp3] = (a1 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[2] + Logtable[a1]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 3, a2);
            if (a2 != 0) tempBuffer[hlp3] ^= (byte) Alogtable[(short) ((short) (Logtable[3] + Logtable[a2]) % 255)];
            tempBuffer[hlp3] ^= a3;
            tempBuffer[hlp3] ^= a0;

            // i == 2
            hlp3 = (byte) (hlp2 + 2);
            //tempBuffer[hlp3] = (byte) mul((byte) 2, a2);
            tempBuffer[hlp3] = (a2 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[2] + Logtable[a2]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 3, a3);
            if (a3 != 0) tempBuffer[hlp3] ^= (byte) Alogtable[(short) ((short) (Logtable[3] + Logtable[a3]) % 255)];
            tempBuffer[hlp3] ^= a0;
            tempBuffer[hlp3] ^= a1;

            // i == 3
            hlp3 = (byte) (hlp2 + 3);
            //tempBuffer[hlp3] = (byte) mul((byte) 2, a3);
            tempBuffer[hlp3] = (a3 != 0) ?
                    (byte) Alogtable[(short) ((short) (Logtable[2] + Logtable[a3]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 3, a0);
            if (a0 != 0) tempBuffer[hlp3] ^= (byte) Alogtable[(short) ((short) (Logtable[3] + Logtable[a0]) % 255)];
            tempBuffer[hlp3] ^= a1;
            tempBuffer[hlp3] ^= a2;
            //*** OPT2: END OPTIMIZED VERSION WITH UNROLLED LOOPS AND EXPANDED mul() FUNCTION
            /**/

/*          // ALOG_MUL - disable the OPT2 block above when using OPT3
            // *** OPT3: OPTIMIZED VERSION WITH UNROLLED LOOPS AND PRE-COMPUTED mul(2,x) AND mul(3,x) FUNCTIONS
            // UNROLLED LOOP: for (i = 0; i < 4; i++)
            // ax WILL CONTAIN THE VALUE OF 'a[(short) (((i + x) % 4) + hlp)];' TRANSFORMED FROM byte TO short (via TAUB-like function)
            a0 = a[hlp];               a0 = (a0 >= 0) ? a0 : (short) (256 + a0);
            a1 = a[(short) (1 + hlp)]; a1 = (a1 >= 0) ? a1 : (short) (256 + a1);
            a2 = a[(short) (2 + hlp)]; a2 = (a2 >= 0) ? a2 : (short) (256 + a2);
            a3 = a[(short) (3 + hlp)]; a3 = (a3 >= 0) ? a3 : (short) (256 + a3);

            // i == 0
            tempBuffer[hlp2] = Alogtable_mul2[a0];
            tempBuffer[hlp2] ^= Alogtable_mul3[a1];
            tempBuffer[hlp2] ^= a2;
            tempBuffer[hlp2] ^= a3;

            // i == 1
            hlp3 = (byte) (hlp2 + 1);
            tempBuffer[hlp3] = Alogtable_mul2[a1];
            tempBuffer[hlp3] ^= Alogtable_mul3[a2];
            tempBuffer[hlp3] ^= a3;
            tempBuffer[hlp3] ^= a0;

            // i == 2
            hlp3 = (byte) (hlp2 + 2);
            tempBuffer[hlp3] = Alogtable_mul2[a2];
            tempBuffer[hlp3] ^= Alogtable_mul3[a3];
            tempBuffer[hlp3] ^= a0;
            tempBuffer[hlp3] ^= a1;

            // i == 3
            hlp3 = (byte) (hlp2 + 3);
            tempBuffer[hlp3] = Alogtable_mul2[a3];
            tempBuffer[hlp3] ^= Alogtable_mul3[a0];
            tempBuffer[hlp3] ^= a1;
            tempBuffer[hlp3] ^= a2;
            //*** OPT3: END OPTIMIZED VERSION WITH UNROLLED LOOPS AND PRE-COMPUTED mul(2,x) AND mul(3,x) FUNCTIONS
/**/
        }
        Util.arrayCopyNonAtomic(tempBuffer, (short) 0, a, dataOffset, STATELEN);
    }

    // INVERSE OF MIXING COLUMNS
    // (each state column is multiplied by the inverse polynomial {0b}x^3 + {0d}x^2 + {09}x + {0e} over GF(2^8))
    private void InvMixColumn(byte a[], short dataOffset) {
        byte i = 0, j = 0;
        // hlp CONTAINS PRECALCULATED EXPRESSION ((j * 4) + dataOffset)
        short hlp = dataOffset;
        // hlp2 CONTAINS PRECALCULATED EXPRESSION (j * 4)
        byte hlp2 = -4;
        byte hlp3 = 0;
        short a0 = 0;
        short a1 = 0;
        short a2 = 0;
        short a3 = 0;

        hlp -= 4;
        for (j = 0; j < BLOCKN; j++) {
            // TIME REDUCING PRECALCULATION
            hlp += 4;
            hlp2 += 4;

/*          // TODO: UNROLL THIS LOOP:
            for (i = 0; i < 4; i++) {
                tempBuffer[(byte) (i + hlp2)] = (byte) mul((byte) 0xe, a[(short) (i + hlp)]);
                tempBuffer[(byte) (i + hlp2)] ^= (byte) mul((byte) 0xb, a[(short) (((i + 1) % 4) + hlp)]);
                tempBuffer[(byte) (i + hlp2)] ^= (byte) mul((byte) 0xd, a[(short) (((i + 2) % 4) + hlp)]);
                tempBuffer[(byte) (i + hlp2)] ^= (byte) mul((byte) 0x9, a[(short) (((i + 3) % 4) + hlp)]);
            }
/**/

            // UNROLLED LOOP
            a0 = a[hlp];               a0 = (a0 >= 0) ? a0 : (short) (256 + a0);
            a1 = a[(short) (1 + hlp)]; a1 = (a1 >= 0) ? a1 : (short) (256 + a1);
            a2 = a[(short) (2 + hlp)]; a2 = (a2 >= 0) ? a2 : (short) (256 + a2);
            a3 = a[(short) (3 + hlp)]; a3 = (a3 >= 0) ? a3 : (short) (256 + a3);

            // i == 0
            //tempBuffer[hlp2] = (byte) mul((byte) 0xe, a0);
            tempBuffer[hlp2] = (a0 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xe] + Logtable[a0]) % 255)] : (byte) 0;
            //tempBuffer[hlp2] ^= (byte) mul((byte) 0xb, a1);
            tempBuffer[hlp2] ^= (a1 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xb] + Logtable[a1]) % 255)] : (byte) 0;
            //tempBuffer[hlp2] ^= (byte) mul((byte) 0xd, a2);
            tempBuffer[hlp2] ^= (a2 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xd] + Logtable[a2]) % 255)] : (byte) 0;
            //tempBuffer[hlp2] ^= (byte) mul((byte) 0x9, a3);
            tempBuffer[hlp2] ^= (a3 != 0) ?
                    (byte) Alogtable[(short) ((short) (Logtable[0x9] + Logtable[a3]) % 255)] : (byte) 0;

            // i == 1
            hlp3 = (byte) (hlp2 + 1);
            //tempBuffer[hlp3] = (byte) mul((byte) 0xe, a1);
            tempBuffer[hlp3] = (a1 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xe] + Logtable[a1]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 0xb, a2);
            tempBuffer[hlp3] ^= (a2 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xb] + Logtable[a2]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 0xd, a3);
            tempBuffer[hlp3] ^= (a3 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xd] + Logtable[a3]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 0x9, a0);
            tempBuffer[hlp3] ^= (a0 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0x9] + Logtable[a0]) % 255)] : (byte) 0;

            // i == 2
            hlp3 = (byte) (hlp2 + 2);
            //tempBuffer[hlp3] = (byte) mul((byte) 0xe, a2);
            tempBuffer[hlp3] = (a2 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xe] + Logtable[a2]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 0xb, a3);
            tempBuffer[hlp3] ^= (a3 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xb] + Logtable[a3]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 0xd, a0);
            tempBuffer[hlp3] ^= (a0 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xd] + Logtable[a0]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 0x9, a1);
            tempBuffer[hlp3] ^= (a1 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0x9] + Logtable[a1]) % 255)] : (byte) 0;

            // i == 3
            hlp3 = (byte) (hlp2 + 3);
            //tempBuffer[hlp3] = (byte) mul((byte) 0xe, a3);
            tempBuffer[hlp3] = (a3 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xe] + Logtable[a3]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 0xb, a0);
            tempBuffer[hlp3] ^= (a0 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xb] + Logtable[a0]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 0xd, a1);
            tempBuffer[hlp3] ^= (a1 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0xd] + Logtable[a1]) % 255)] : (byte) 0;
            //tempBuffer[hlp3] ^= (byte) mul((byte) 0x9, a2);
            tempBuffer[hlp3] ^= (a2 != 0) ? (byte) Alogtable[(short) ((short) (Logtable[0x9] + Logtable[a2]) % 255)] : (byte) 0;
            // END OF UNROLLED LOOP
            /**/
        }
        Util.arrayCopyNonAtomic(tempBuffer, (short) 0, a, dataOffset, STATELEN);
    }

    /**
     * Encrypt one block, key schedule must be already processed
     * @param data ... data array to be encrypted
     * @param dataOffset ... start offset in data array
     * @param aesRoundKeys ... scheduled keys for AES (from RoundKeysSchedule() function)
     * @return true if encrypt success, false otherwise.
     */
    public boolean AESEncryptBlock(byte data[], short dataOffset, byte[] aesRoundKeys) {
        byte r;
        byte i;
        short keysOffset = 0;

        // *** ADD ROUND KEY
        //KeyAddition(data, dataOffset, roundKeys, (byte) 0);
        for (i = 0; i < STATELEN; i++) data[(short) (i + dataOffset)] ^= aesRoundKeys[i];

        // N_ROUNDS-1 ORDINARY ROUNDS
        for (r = 1; r < N_ROUNDS; r++) {
            keysOffset += STATELEN;

            // *** SUBSTITUTION
            //Substitution(data, dataOffset, SBox);
            for (i = 0; i < STATELEN; i++)
                data[(short) (i + dataOffset)] = SBox[((data[(short) (i + dataOffset)] >= 0) ?
                        data[(short) (i + dataOffset)] : (short) (256 + data[(short) (i + dataOffset)]))];

            // *** SHIFT ROW
            ShiftRow(data, dataOffset, (byte) 0);

            // *** MIX COLUMN
            MixColumn(data, dataOffset);

            // *** ADD ROUND KEY
            // KeyAddition(data, dataOffset, roundKeys, (short) (r * STATELEN));
            for (i = 0; i < STATELEN; i++) data[(short) (i + dataOffset)] ^= aesRoundKeys[(short) (i + keysOffset)];
        }

        // *** NO MIXCOLUMN IN THE LAST ROUND

        // *** SUBSTITUTION
        //Substitution(data, dataOffset, SBox);
        for (i = 0; i < STATELEN; i++)
            data[(short) (i + dataOffset)] = SBox[((data[(short) (i + dataOffset)] >= 0) ?
                    data[(short) (i + dataOffset)] : (short) (256 + data[(short) (i + dataOffset)]))];

        // *** SHIFT ROW
        ShiftRow(data, dataOffset, (byte) 0);

        // *** ADD ROUND KEY
        //KeyAddition(data, dataOffset, roundKeys, (short) (N_ROUNDS * STATELEN));
        keysOffset += STATELEN;
        for (i = 0; i < STATELEN; i++) data[(short) (i + dataOffset)] ^= aesRoundKeys[(short) (i + keysOffset)];

        return true;
    }

    /**
     * Decrypt one block, key schedule must be already processed
     * @param data ... data array to be decrypted
     * @param dataOffset ... start offset in data array
     * @param aesRoundKeys ... scheduled keys for AES (from RoundKeysSchedule() function)
     * @return true if decrypt success, false otherwise.
     */
    public boolean AESDecryptBlock(byte data[], short dataOffset, byte[] aesRoundKeys) {
        byte r;
        short i;
        short keysOffset = 0;

        // *** ADD ROUND KEY
        //KeyAddition(data, dataOffset, roundKeys, (short) (N_ROUNDS * STATELEN));
        keysOffset = (short) (N_ROUNDS * STATELEN);
        for (i = 0; i < STATELEN; i++) data[(short) (i + dataOffset)] ^= aesRoundKeys[(short) (i + keysOffset)];

        // *** SHIFT ROW
        ShiftRow(data, dataOffset, (byte) 1);

        // *** SUBSTITUTION
        // Substitution(data, dataOffset, SiBox);
        for (i = 0; i < STATELEN; i++)
            data[(short) (i + dataOffset)] = SiBox[((data[(short) (i + dataOffset)] >= 0) ?
                    data[(short) (i + dataOffset)] : (short) (256 + data[(short) (i + dataOffset)]))];

        for (r = (byte) (N_ROUNDS - 1); r > 0; r--) {
            keysOffset -= STATELEN;

            // *** ADD ROUND KEY
            // KeyAddition(data, dataOffset, roundKeys, (short) (r * STATELEN));
            for (i = 0; i < STATELEN; i++) data[(short) (i + dataOffset)] ^= aesRoundKeys[(short) (i + keysOffset)];

            // *** INVERSE MIX COLUMN
            InvMixColumn(data, dataOffset);

            // *** SHIFT ROW
            ShiftRow(data, dataOffset, (byte) 1);

            // *** SUBSTITUTION
            // Substitution(data, dataOffset, SiBox);
            for (i = 0; i < STATELEN; i++)
                data[(short) (i + dataOffset)] = SiBox[((data[(short) (i + dataOffset)] >= 0) ?
                        data[(short) (i + dataOffset)] : (short) (256 + data[(short) (i + dataOffset)]))];
        }

        // *** ADD ROUND KEY
        //KeyAddition(data, dataOffset, roundKeys, (byte) 0);
        for (i = 0; i < STATELEN; i++) data[(short) (i + dataOffset)] ^= aesRoundKeys[i];

        return true;
    }
}
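
/*  KNOWN-ANSWER TEST (added for convenience; standard AES-128 vector from FIPS-197, Appendix C.1).
    It applies only to the full cipher with N_ROUNDS = 10; a reduced-round variant will not match.

      key        = 000102030405060708090a0b0c0d0e0f
      plaintext  = 00112233445566778899aabbccddeeff
      ciphertext = 69c4e0d86a7b0430d8cdb78070b4c55a

    A hypothetical self-test sketch (key, block and roundKeys are placeholder arrays; block
    initially holds the plaintext, roundKeys holds at least (N_ROUNDS + 1) * STATELEN = 176 bytes):

      aesCipher.RoundKeysSchedule(key, (short) 0, roundKeys);
      aesCipher.AESEncryptBlock(block, (short) 0, roundKeys);   // block should now equal the ciphertext above
      aesCipher.AESDecryptBlock(block, (short) 0, roundKeys);   // block should be restored to the plaintext
/**/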