/*
 * This file contains the core of a bitslice DES implementation for x86/MMX.
 * It is part of John the Ripper password cracker,
 * Copyright (c) 2000-2001,2005,2006,2008,2011 by Solar Designer
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.  (This is a heavily cut-down "BSD license".)
 *
 * Gate counts per S-box: 49 44 46 33 48 46 46 41
 * Average: 44.125
 *
 * The Boolean expressions corresponding to DES S-boxes have been generated
 * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
 * John the Ripper password cracker: http://www.openwall.com/john/
 * Being mathematical formulas, they are not copyrighted and are free for reuse
 * by anyone.
 *
 * The x86/MMX code for the S-boxes was generated by Solar Designer using a
 * Perl script, hand-optimized, and then had its instruction scheduling tuned
 * on a Pentium 3 using a brute-force instruction scheduling program running
 * the individual S-boxes in a context similar to that of this file.
 *
 * The effort has been sponsored by Rapid7: http://www.rapid7.com
 *
 * Note: there's some MMX code in x86.S as well (just not for bitslice DES).
 */

#include "arch.h"

#if DES_BS_ASM

/*
 * When UNDERSCORES is defined, every symbol this file exports is renamed to
 * carry a leading underscore, for object formats whose C symbols are
 * underscore-prefixed.
 */
#ifdef UNDERSCORES
#define DES_bs_all			_DES_bs_all
#define DES_bs_init_asm			_DES_bs_init_asm
#define DES_bs_crypt			_DES_bs_crypt
#define DES_bs_crypt_25			_DES_bs_crypt_25
#define DES_bs_crypt_LM			_DES_bs_crypt_LM
#endif

/*
 * DO_ALIGN(log) requests an alignment of 2**log bytes.
 *
 * Assemblers disagree on the operand of .align: with ALIGN_LOG defined it
 * is passed as the log2 value itself, otherwise as a byte count.
 *
 * Some broken systems don't offer section alignments larger than 4 bytes,
 * while the MMX code needs at least 8 byte alignment.  Defining ALIGN_FIX
 * appends 4 bytes of padding after every alignment directive to work
 * around this issue when we happen to get bad addresses.
 */
#ifdef ALIGN_LOG
#define DES_BS_ALIGN_ARG(log)		log
#else
#define DES_BS_ALIGN_ARG(log)		1 << log
#endif

#ifdef ALIGN_FIX
#define DO_ALIGN(log)			.align DES_BS_ALIGN_ARG(log); .space 4
#else
#define DO_ALIGN(log)			.align DES_BS_ALIGN_ARG(log)
#endif

/*
 * DO_SPACE(size) reserves size bytes at the current location, using
 * whichever directive the target assembler understands.
 */
#ifdef __sun
/* Sun's assembler doesn't recognize .space */
#define DO_SPACE(size)			.zero size
#else
/* Mac OS X assembler doesn't recognize .zero */
#define DO_SPACE(size)			.space size
#endif

/*
 * Sun's assembler can't multiply, but at least it can add...
 *
 * nptr(n) is 4*n: the byte offset of the n-th 4-byte slot (the slots read
 *         with movl and then used as addresses in this file).
 * nvec(n) is 8*n: the byte offset of the n-th 8-byte MMX vector.
 *
 * The argument is pasted in unparenthesized, so it must be a plain number or
 * a simple sum/difference such as "i + 1" or "0x300-48"; those still expand
 * to the intended multiple because + and - associate left to right.
 */
#define nptr(n)				n+n+n+n
#define nvec(n)				n+n+n+n+n+n+n+n

/*
 * Storage for DES_bs_all, the one exported data object of this file.
 * It is placed in .data on BSD builds and in .bss otherwise, and is laid
 * out as a sequence of sub-areas, each with a file-local label.
 * NOTE(review): the layout presumably has to match a structure declared on
 * the C side; verify any size change against that declaration.
 */
#ifdef BSD
.data
#else
.bss
#endif

.globl DES_bs_all
DO_ALIGN(5)
DES_bs_all:
/* 0x300 pointer-sized slots; not referenced by the code in this file. */
DES_bs_all_KSp:
DO_SPACE(nptr(0x300))
/*
 * Key schedule area.  Both labels name the same address: DES_bs_crypt and
 * DES_bs_crypt_25 read it as 8-byte vectors (KS_v), DES_bs_crypt_LM reads
 * it as pointer-sized slots (KS_p).
 */
DES_bs_all_KS_p:
DES_bs_all_KS_v:
DO_SPACE(nvec(0x300))
/* 96 pointer-sized slots dereferenced by xor_E. */
DES_bs_all_E:
DO_SPACE(nptr(96))
/* 56 vectors; not referenced by the code in this file. */
DES_bs_all_K:
DO_SPACE(nvec(56))
/* 64 vectors: the data block the S-box macros read-modify-write via B(). */
DES_bs_all_B:
DO_SPACE(nvec(64))
/* 16 vectors of scratch: slot 0 is pnot, the rest are S-box temporaries. */
DES_bs_all_tmp:
DO_SPACE(nvec(16))
DES_bs_all_fields_not_used_here:
DO_SPACE(0x400 + 0x100 + 4 + 4 + 0x200)
DES_bs_all_possible_alignment_gaps:
DO_SPACE(0x100)

/*
 * Absolute-address accessors into DES_bs_all:
 *   E(i)      - i-th pointer-sized slot of DES_bs_all_E
 *   B(i)      - i-th 8-byte vector of DES_bs_all_B
 *   tmp_at(i) - i-th 8-byte vector of DES_bs_all_tmp
 */
#define E(i)				DES_bs_all_E+nptr(i)
#define B(i)				DES_bs_all_B+nvec(i)
#define tmp_at(i)			DES_bs_all_tmp+nvec(i)

/*
 * Scratch slot 0 holds the all-ones constant written by DES_bs_init_asm.
 * The S-box macros XOR it in to negate a value and never write to it.
 */
#define pnot				tmp_at(0)

/*
 * S1(out1, out2, out3, out4): first bitslice S-box macro (presumably DES
 * S-box 1, the 49-gate entry of the table in the header comment).
 *
 * Inputs:  %mm0..%mm5 (the a1..a6 aliases defined later in this file).
 * Outputs: each outN is read, XORed with one computed result vector, and
 *          written back; every use in this file passes B(n) slots.
 * Scratch: all of %mm0..%mm7 are overwritten; tmp_at(1)..tmp_at(9) are used
 *          as spill slots.  pnot is read only and must already contain all
 *          ones (DES_bs_init_asm stores that value).
 *
 * The instruction order is the result of the scheduling pass described in
 * the header comment; do not reorder lines without re-measuring.
 */
#define S1(out1, out2, out3, out4) \
	movq %mm0,tmp_at(1); \
	movq %mm5,%mm7; \
	movq %mm4,tmp_at(4); \
	movq %mm2,%mm6; \
	movq %mm1,tmp_at(2); \
	por %mm2,%mm7; \
	movq %mm3,tmp_at(3); \
	pxor %mm0,%mm6; \
	movq %mm7,tmp_at(5); \
	movq %mm6,%mm1; \
	pandn %mm0,%mm4; \
	pand %mm7,%mm1; \
	movq %mm1,%mm7; \
	por %mm5,%mm7; \
	pxor %mm3,%mm1; \
	pxor %mm4,%mm3; \
	movq %mm1,tmp_at(6); \
	movq %mm3,%mm1; \
	pandn tmp_at(6),%mm3; \
	movq %mm3,tmp_at(7); \
	movq %mm5,%mm3; \
	por %mm0,%mm5; \
	pxor tmp_at(4),%mm3; \
	movq %mm3,tmp_at(8); \
	movq %mm5,%mm0; \
	pandn %mm3,%mm6; \
	pxor %mm2,%mm3; \
	pandn %mm2,%mm4; \
	pandn %mm1,%mm3; \
	pxor %mm3,%mm7; \
	movq tmp_at(7),%mm3; \
	pandn tmp_at(3),%mm5; \
	por %mm7,%mm0; \
	pandn %mm7,%mm3; \
	movq %mm3,tmp_at(9); \
	pand tmp_at(5),%mm7; \
	movq tmp_at(6),%mm3; \
	movq %mm0,%mm2; \
	pxor %mm1,%mm2; \
	pandn tmp_at(4),%mm3; \
	pandn %mm2,%mm4; \
	movq tmp_at(2),%mm2; \
	pxor %mm4,%mm7; \
	pxor tmp_at(8),%mm4; \
	pxor %mm3,%mm5; \
	por %mm3,%mm4; \
	pxor tmp_at(1),%mm4; \
	pxor %mm0,%mm3; \
	pandn %mm3,%mm2; \
	pxor tmp_at(5),%mm0; \
	movq tmp_at(7),%mm3; \
	por tmp_at(2),%mm3; \
	pxor pnot,%mm7; \
	pxor out1,%mm3; \
	pxor %mm7,%mm2; \
	pxor tmp_at(5),%mm4; \
	pxor out3,%mm2; \
	pxor %mm4,%mm7; \
	pxor %mm7,%mm3; \
	movq %mm3,out1; \
	por %mm6,%mm5; \
	por tmp_at(8),%mm7; \
	por %mm5,%mm0; \
	pxor out2,%mm7; \
	pxor %mm4,%mm0; \
	pxor %mm0,%mm7; \
	por tmp_at(4),%mm1; \
	movq tmp_at(2),%mm3; \
	pand tmp_at(9),%mm4; \
	pandn %mm1,%mm0; \
	pxor %mm0,%mm4; \
	por tmp_at(9),%mm3; \
	por tmp_at(2),%mm4; \
	movq %mm2,out3; \
	pxor %mm3,%mm7; \
	pxor %mm5,%mm4; \
	pxor out4,%mm4; \
	movq %mm7,out2; \
	movq %mm4,out4

/*
 * S2(out1, out2, out3, out4): second bitslice S-box macro (presumably DES
 * S-box 2, the 44-gate entry of the table in the header comment).
 *
 * Inputs:  %mm0..%mm5 (the a1..a6 aliases defined later in this file).
 * Outputs: each outN is read, XORed with one computed result vector, and
 *          written back; every use in this file passes B(n) slots.
 * Scratch: all of %mm0..%mm7 are overwritten; tmp_at(1)..tmp_at(5),
 *          tmp_at(7) and tmp_at(8) are used as spill slots.  pnot is read
 *          only and must already contain all ones.
 *
 * NOTE(review): the second "movq %mm5,%mm7" (just before
 * "pand tmp_at(2),%mm5") is overwritten by "movq tmp_at(8),%mm7" before
 * %mm7 is read again; presumably kept for scheduling reasons - confirm
 * before removing.
 *
 * The instruction order is the result of the scheduling pass described in
 * the header comment; do not reorder lines without re-measuring.
 */
#define S2(out1, out2, out3, out4) \
	movq %mm2,tmp_at(2); \
	movq %mm1,tmp_at(1); \
	movq %mm5,%mm2; \
	movq %mm4,tmp_at(4); \
	pandn %mm0,%mm2; \
	movq %mm3,tmp_at(3); \
	pandn %mm4,%mm2; \
	movq %mm0,%mm6; \
	movq %mm2,%mm7; \
	pxor pnot,%mm0; \
	por %mm1,%mm7; \
	pxor %mm4,%mm1; \
	movq %mm7,tmp_at(5); \
	pand %mm1,%mm6; \
	movq %mm5,%mm7; \
	pxor %mm4,%mm6; \
	pandn %mm1,%mm7; \
	movq %mm3,%mm4; \
	pxor %mm7,%mm2; \
	pandn %mm6,%mm7; \
	pxor %mm5,%mm1; \
	movq %mm7,tmp_at(7); \
	movq %mm5,%mm7; \
	pand tmp_at(2),%mm5; \
	pand tmp_at(5),%mm2; \
	movq %mm5,tmp_at(8); \
	pandn %mm2,%mm5; \
	pand tmp_at(2),%mm2; \
	movq tmp_at(8),%mm7; \
	pandn tmp_at(3),%mm5; \
	pandn %mm1,%mm7; \
	pxor %mm2,%mm0; \
	movq %mm7,%mm3; \
	pxor %mm0,%mm3; \
	pxor out2,%mm5; \
	pandn tmp_at(1),%mm7; \
	pxor %mm6,%mm7; \
	pxor %mm3,%mm5; \
	movq %mm7,%mm6; \
	movq %mm5,out2; \
	movq tmp_at(7),%mm5; \
	pandn tmp_at(5),%mm4; \
	pandn %mm0,%mm6; \
	pxor tmp_at(5),%mm3; \
	movq %mm1,%mm0; \
	pxor %mm4,%mm6; \
	pxor tmp_at(2),%mm0; \
	pxor %mm0,%mm6; \
	movq %mm0,%mm4; \
	pxor out1,%mm6; \
	pandn tmp_at(1),%mm0; \
	pxor tmp_at(4),%mm2; \
	pxor %mm3,%mm0; \
	movq %mm6,out1; \
	por %mm1,%mm3; \
	por tmp_at(8),%mm0; \
	pxor %mm4,%mm0; \
	movq %mm0,%mm4; \
	pandn tmp_at(2),%mm0; \
	movq tmp_at(3),%mm6; \
	pxor tmp_at(7),%mm0; \
	por %mm7,%mm0; \
	por %mm6,%mm5; \
	pxor %mm0,%mm2; \
	pandn %mm2,%mm7; \
	por %mm2,%mm6; \
	pxor out4,%mm7; \
	pxor %mm4,%mm6; \
	pxor out3,%mm6; \
	pxor %mm5,%mm7; \
	pxor %mm3,%mm7; \
	movq %mm6,out3; \
	movq %mm7,out4

/*
 * S3(out1, out2, out3, out4): third bitslice S-box macro (presumably DES
 * S-box 3, the 46-gate entry of the table in the header comment).
 *
 * Inputs:  %mm0..%mm5 (the a1..a6 aliases defined later in this file).
 * Outputs: each outN is read, XORed with one computed result vector, and
 *          written back; every use in this file passes B(n) slots.
 * Scratch: all of %mm0..%mm7 are overwritten; tmp_at(1)..tmp_at(7) are used
 *          as spill slots.  pnot is read only and must already contain all
 *          ones.
 *
 * The instruction order is the result of the scheduling pass described in
 * the header comment; do not reorder lines without re-measuring.
 */
#define S3(out1, out2, out3, out4) \
	movq %mm0,tmp_at(1); \
	movq %mm1,tmp_at(2); \
	movq %mm0,%mm7; \
	pandn %mm0,%mm1; \
	movq %mm2,tmp_at(3); \
	movq %mm5,%mm0; \
	pxor %mm2,%mm0; \
	movq %mm4,tmp_at(4); \
	movq %mm5,%mm2; \
	por %mm0,%mm1; \
	pxor %mm3,%mm2; \
	movq %mm0,%mm4; \
	movq %mm5,%mm6; \
	pandn %mm2,%mm7; \
	pxor tmp_at(2),%mm4; \
	movq %mm7,tmp_at(5); \
	pxor %mm1,%mm7; \
	pandn %mm4,%mm6; \
	movq %mm7,tmp_at(6); \
	pxor %mm6,%mm1; \
	pand %mm0,%mm2; \
	movq %mm1,%mm6; \
	movq %mm3,%mm0; \
	pandn %mm7,%mm6; \
	pand %mm5,%mm7; \
	pand %mm3,%mm5; \
	por %mm3,%mm7; \
	pand tmp_at(1),%mm7; \
	movq tmp_at(4),%mm3; \
	pandn tmp_at(6),%mm3; \
	pxor %mm4,%mm7; \
	pxor tmp_at(1),%mm0; \
	movq %mm7,tmp_at(7); \
	pxor %mm3,%mm7; \
	movq tmp_at(2),%mm3; \
	pxor out4,%mm7; \
	pxor %mm0,%mm1; \
	movq %mm7,out4; \
	movq tmp_at(3),%mm7; \
	por tmp_at(3),%mm1; \
	pandn %mm1,%mm2; \
	por tmp_at(5),%mm0; \
	movq %mm0,%mm1; \
	pandn %mm5,%mm3; \
	pandn tmp_at(7),%mm1; \
	por %mm4,%mm5; \
	pxor %mm3,%mm1; \
	por tmp_at(2),%mm7; \
	movq tmp_at(3),%mm3; \
	pandn %mm1,%mm3; \
	pxor %mm4,%mm0; \
	pandn %mm5,%mm3; \
	movq tmp_at(4),%mm5; \
	pxor tmp_at(1),%mm3; \
	pand %mm2,%mm5; \
	pxor pnot,%mm0; \
	pxor %mm5,%mm3; \
	movq %mm7,%mm5; \
	pxor out2,%mm3; \
	pandn tmp_at(4),%mm6; \
	pandn tmp_at(6),%mm7; \
	pxor %mm0,%mm6; \
	movq %mm3,out2; \
	pxor tmp_at(1),%mm2; \
	por tmp_at(4),%mm1; \
	por %mm2,%mm0; \
	pxor tmp_at(6),%mm5; \
	pxor %mm1,%mm0; \
	pxor out1,%mm6; \
	pxor out3,%mm5; \
	pxor tmp_at(7),%mm0; \
	pxor %mm7,%mm6; \
	pxor %mm5,%mm0; \
	movq %mm6,out1; \
	movq %mm0,out3

/*
 * S4(out1, out2, out3, out4): fourth bitslice S-box macro (presumably DES
 * S-box 4, the 33-gate entry of the table in the header comment).
 *
 * Inputs:  %mm0..%mm5 (the a1..a6 aliases defined later in this file).
 * Outputs: each outN is read, XORed with one computed result vector, and
 *          written back; every use in this file passes B(n) slots.
 * Scratch: all of %mm0..%mm7 are overwritten; only tmp_at(2) and tmp_at(3)
 *          are used as spill slots.  pnot is read only and must already
 *          contain all ones.
 *
 * NOTE(review): the first "movq %mm7,%mm6" (after "pxor %mm4,%mm1") is
 * overwritten by the second "movq %mm7,%mm6" before %mm6 is read;
 * presumably kept for scheduling reasons - confirm before removing.
 *
 * The instruction order is the result of the scheduling pass described in
 * the header comment; do not reorder lines without re-measuring.
 */
#define S4(out1, out2, out3, out4) \
	movq %mm1,%mm7; \
	pxor %mm2,%mm0; \
	por %mm3,%mm1; \
	pxor %mm4,%mm2; \
	movq %mm5,tmp_at(2); \
	pxor %mm4,%mm1; \
	movq %mm7,%mm6; \
	movq %mm7,%mm5; \
	pandn %mm2,%mm7; \
	pandn %mm2,%mm1; \
	por %mm7,%mm4; \
	pxor %mm3,%mm7; \
	movq %mm7,%mm6; \
	por %mm0,%mm7; \
	pxor %mm5,%mm3; \
	movq %mm1,tmp_at(3); \
	pandn %mm7,%mm1; \
	movq %mm1,%mm7; \
	pxor %mm5,%mm1; \
	pand %mm1,%mm6; \
	movq %mm6,%mm5; \
	pxor %mm1,%mm0; \
	pandn %mm2,%mm6; \
	pandn %mm0,%mm6; \
	pxor %mm0,%mm4; \
	movq %mm3,%mm0; \
	pandn %mm4,%mm3; \
	movq tmp_at(2),%mm2; \
	pxor %mm7,%mm3; \
	pxor tmp_at(3),%mm6; \
	movq %mm6,%mm7; \
	pandn %mm2,%mm6; \
	pxor out1,%mm6; \
	pandn %mm7,%mm2; \
	pxor out2,%mm2; \
	pxor %mm3,%mm6; \
	pxor pnot,%mm3; \
	pxor %mm3,%mm2; \
	pxor %mm7,%mm3; \
	movq %mm6,out1; \
	pandn %mm3,%mm0; \
	por %mm5,%mm0; \
	movq %mm2,out2; \
	movq tmp_at(2),%mm3; \
	por %mm1,%mm3; \
	pand tmp_at(2),%mm1; \
	pxor %mm4,%mm0; \
	pxor %mm0,%mm3; \
	pxor out3,%mm3; \
	pxor %mm1,%mm0; \
	movq %mm3,out3; \
	pxor out4,%mm0; \
	movq %mm0,out4

/*
 * S5(out1, out2, out3, out4): fifth bitslice S-box macro (presumably DES
 * S-box 5, the 48-gate entry of the table in the header comment).
 *
 * Inputs:  %mm0..%mm5 (the a1..a6 aliases defined later in this file).
 * Outputs: each outN is read, XORed with one computed result vector, and
 *          written back; every use in this file passes B(n) slots.
 * Scratch: all of %mm0..%mm7 are overwritten; tmp_at(1)..tmp_at(8) are used
 *          as spill slots.  pnot is read only and must already contain all
 *          ones.
 *
 * NOTE(review): "movq %mm2,%mm7" is overwritten by "movq %mm3,%mm7" before
 * %mm7 is read; presumably kept for scheduling reasons - confirm before
 * removing.
 *
 * The instruction order is the result of the scheduling pass described in
 * the header comment; do not reorder lines without re-measuring.
 */
#define S5(out1, out2, out3, out4) \
	movq %mm2,tmp_at(3); \
	movq %mm0,tmp_at(1); \
	por %mm0,%mm2; \
	movq %mm5,%mm6; \
	movq %mm2,tmp_at(4); \
	pandn %mm2,%mm5; \
	movq %mm2,%mm7; \
	movq %mm5,%mm2; \
	pxor %mm0,%mm5; \
	movq %mm3,%mm7; \
	movq %mm5,tmp_at(5); \
	pxor tmp_at(3),%mm5; \
	movq %mm1,tmp_at(2); \
	por %mm5,%mm0; \
	por %mm3,%mm5; \
	pandn %mm2,%mm3; \
	pxor tmp_at(3),%mm3; \
	movq %mm3,tmp_at(6); \
	movq %mm0,%mm1; \
	pand %mm4,%mm3; \
	pxor %mm0,%mm3; \
	pand %mm7,%mm0; \
	pxor %mm7,%mm3; \
	movq %mm3,tmp_at(3); \
	pxor %mm3,%mm6; \
	movq %mm6,%mm2; \
	por tmp_at(5),%mm6; \
	movq %mm6,%mm3; \
	pand %mm4,%mm6; \
	movq %mm6,tmp_at(7); \
	pxor tmp_at(5),%mm6; \
	pxor %mm6,%mm0; \
	movq tmp_at(1),%mm6; \
	movq %mm0,tmp_at(8); \
	pandn %mm3,%mm6; \
	movq tmp_at(2),%mm0; \
	movq %mm6,%mm3; \
	pxor tmp_at(6),%mm6; \
	pxor %mm5,%mm4; \
	pandn %mm4,%mm6; \
	pxor pnot,%mm6; \
	pandn %mm6,%mm0; \
	pxor tmp_at(3),%mm0; \
	movq tmp_at(7),%mm6; \
	pandn tmp_at(6),%mm6; \
	pxor out3,%mm0; \
	pxor %mm4,%mm3; \
	movq %mm0,out3; \
	por tmp_at(8),%mm3; \
	movq tmp_at(6),%mm0; \
	pandn %mm3,%mm6; \
	pand tmp_at(6),%mm1; \
	pand %mm6,%mm2; \
	movq %mm6,%mm3; \
	pandn %mm5,%mm6; \
	pxor %mm4,%mm2; \
	por %mm2,%mm1; \
	pxor tmp_at(4),%mm3; \
	pxor tmp_at(7),%mm1; \
	pand %mm2,%mm7; \
	pand tmp_at(2),%mm1; \
	pxor tmp_at(1),%mm7; \
	pxor tmp_at(8),%mm1; \
	pxor %mm7,%mm3; \
	por tmp_at(2),%mm6; \
	pxor out4,%mm1; \
	movq %mm1,out4; \
	pxor %mm5,%mm0; \
	pxor tmp_at(5),%mm2; \
	pxor %mm3,%mm6; \
	pandn %mm0,%mm3; \
	pand tmp_at(2),%mm5; \
	pxor %mm2,%mm3; \
	pxor out2,%mm5; \
	pxor %mm5,%mm3; \
	pxor out1,%mm6; \
	movq %mm3,out2; \
	movq %mm6,out1

/*
 * S6(out1, out2, out3, out4): sixth bitslice S-box macro (presumably DES
 * S-box 6, the second 46-gate entry of the table in the header comment).
 *
 * Inputs:  %mm0..%mm5 (the a1..a6 aliases defined later in this file).
 * Outputs: each outN is read, XORed with one computed result vector, and
 *          written back; every use in this file passes B(n) slots.
 * Scratch: all of %mm0..%mm7 are overwritten; tmp_at(1)..tmp_at(10) and
 *          tmp_at(12) are used as spill slots (tmp_at(11) is skipped).
 *          pnot is read only and must already contain all ones.
 *
 * The instruction order is the result of the scheduling pass described in
 * the header comment; do not reorder lines without re-measuring.
 */
#define S6(out1, out2, out3, out4) \
	movq %mm4,tmp_at(2); \
	pxor %mm1,%mm4; \
	movq %mm5,tmp_at(3); \
	por %mm1,%mm5; \
	movq %mm2,%mm7; \
	pand %mm0,%mm5; \
	pxor %mm0,%mm2; \
	movq %mm0,tmp_at(1); \
	pxor %mm5,%mm4; \
	movq %mm4,tmp_at(4); \
	pxor tmp_at(3),%mm4; \
	movq %mm4,%mm6; \
	pandn tmp_at(2),%mm4; \
	pand %mm0,%mm6; \
	movq %mm6,tmp_at(5); \
	pxor %mm1,%mm6; \
	movq %mm6,tmp_at(6); \
	por %mm2,%mm6; \
	movq %mm6,tmp_at(7); \
	pxor tmp_at(4),%mm6; \
	movq %mm6,%mm0; \
	pand %mm7,%mm6; \
	movq %mm6,tmp_at(8); \
	movq tmp_at(3),%mm6; \
	por %mm1,%mm2; \
	pandn tmp_at(8),%mm6; \
	movq %mm6,tmp_at(9); \
	movq tmp_at(6),%mm6; \
	por %mm4,%mm6; \
	movq %mm6,tmp_at(6); \
	pxor tmp_at(9),%mm6; \
	movq %mm6,tmp_at(10); \
	pand %mm3,%mm6; \
	pxor out4,%mm6; \
	pxor %mm0,%mm6; \
	por tmp_at(1),%mm0; \
	movq %mm6,out4; \
	movq tmp_at(7),%mm6; \
	pxor %mm1,%mm6; \
	movq %mm3,%mm1; \
	movq %mm6,tmp_at(7); \
	pandn tmp_at(3),%mm6; \
	pxor %mm7,%mm6; \
	movq tmp_at(8),%mm7; \
	movq %mm6,tmp_at(12); \
	pandn tmp_at(2),%mm7; \
	pand tmp_at(6),%mm0; \
	por %mm6,%mm7; \
	pxor %mm6,%mm0; \
	movq tmp_at(9),%mm6; \
	por %mm3,%mm4; \
	pandn %mm0,%mm6; \
	por %mm7,%mm5; \
	pxor %mm4,%mm6; \
	pxor tmp_at(4),%mm0; \
	pxor out3,%mm6; \
	pxor %mm2,%mm5; \
	movq %mm6,out3; \
	movq tmp_at(5),%mm6; \
	pandn tmp_at(2),%mm0; \
	pxor pnot,%mm2; \
	pxor tmp_at(7),%mm2; \
	pxor tmp_at(3),%mm6; \
	pxor out2,%mm5; \
	movq tmp_at(12),%mm4; \
	pxor %mm2,%mm0; \
	pxor tmp_at(1),%mm4; \
	pxor tmp_at(10),%mm5; \
	pand %mm6,%mm4; \
	pandn %mm0,%mm3; \
	pxor out1,%mm4; \
	pandn %mm7,%mm1; \
	pxor tmp_at(8),%mm4; \
	pxor %mm2,%mm1; \
	pxor %mm3,%mm5; \
	movq %mm5,out2; \
	pxor %mm1,%mm4; \
	movq %mm4,out1

/*
 * S7(out1, out2, out3, out4): seventh bitslice S-box macro (presumably DES
 * S-box 7, the third 46-gate entry of the table in the header comment).
 *
 * Inputs:  %mm0..%mm5 (the a1..a6 aliases defined later in this file).
 * Outputs: each outN is read, XORed with one computed result vector, and
 *          written back; every use in this file passes B(n) slots.
 * Scratch: all of %mm0..%mm7 are overwritten; tmp_at(1)..tmp_at(7) and
 *          tmp_at(9) are used as spill slots (tmp_at(8) is skipped).
 *          pnot is read only and must already contain all ones.
 *
 * NOTE(review): "movq %mm4,%mm0" near the top is overwritten by
 * "movq %mm6,%mm0" before %mm0 is read (the original %mm0 has already been
 * saved in tmp_at(1)); presumably kept for scheduling reasons - confirm
 * before removing.
 *
 * The instruction order is the result of the scheduling pass described in
 * the header comment; do not reorder lines without re-measuring.
 */
#define S7(out1, out2, out3, out4) \
	movq %mm0,tmp_at(1); \
	movq %mm4,tmp_at(3); \
	movq %mm4,%mm0; \
	pxor %mm3,%mm4; \
	movq %mm5,tmp_at(4); \
	movq %mm4,%mm7; \
	movq %mm3,tmp_at(2); \
	pxor %mm2,%mm4; \
	movq %mm4,tmp_at(5); \
	pand %mm5,%mm4; \
	movq %mm7,%mm5; \
	pxor tmp_at(4),%mm5; \
	pand %mm3,%mm7; \
	movq %mm7,tmp_at(6); \
	movq %mm7,%mm6; \
	pxor %mm1,%mm7; \
	pand tmp_at(4),%mm6; \
	pxor %mm2,%mm6; \
	movq %mm7,tmp_at(7); \
	movq tmp_at(1),%mm3; \
	movq %mm6,%mm0; \
	por %mm7,%mm6; \
	pand %mm4,%mm7; \
	pxor %mm5,%mm6; \
	pandn %mm3,%mm7; \
	pxor %mm4,%mm0; \
	pxor out4,%mm7; \
	pxor %mm5,%mm4; \
	pxor %mm6,%mm7; \
	movq %mm7,out4; \
	pandn tmp_at(2),%mm4; \
	por tmp_at(6),%mm6; \
	movq tmp_at(5),%mm7; \
	pandn tmp_at(3),%mm7; \
	pandn tmp_at(7),%mm4; \
	movq %mm7,tmp_at(9); \
	por tmp_at(7),%mm7; \
	pandn tmp_at(5),%mm5; \
	pxor %mm0,%mm7; \
	pxor tmp_at(3),%mm0; \
	pxor %mm4,%mm0; \
	movq tmp_at(1),%mm4; \
	pand %mm0,%mm2; \
	por %mm2,%mm6; \
	pxor %mm5,%mm6; \
	pandn %mm6,%mm3; \
	movq %mm6,%mm5; \
	pxor %mm7,%mm3; \
	pxor %mm6,%mm7; \
	por %mm0,%mm6; \
	pxor out1,%mm3; \
	pand tmp_at(4),%mm6; \
	pxor pnot,%mm5; \
	pand %mm6,%mm1; \
	pxor out3,%mm0; \
	pxor %mm7,%mm1; \
	movq %mm3,out1; \
	movq %mm4,%mm3; \
	pxor tmp_at(3),%mm7; \
	por %mm1,%mm2; \
	pxor %mm6,%mm2; \
	por %mm2,%mm7; \
	pand %mm7,%mm4; \
	pxor %mm6,%mm7; \
	por tmp_at(9),%mm7; \
	pxor %mm5,%mm7; \
	pxor out2,%mm1; \
	pandn %mm7,%mm3; \
	pxor %mm4,%mm0; \
	movq %mm0,out3; \
	pxor %mm3,%mm1; \
	movq %mm1,out2

/*
 * S8(out1, out2, out3, out4): eighth bitslice S-box macro (presumably DES
 * S-box 8, the 41-gate entry of the table in the header comment).
 *
 * Inputs:  %mm0..%mm5 (the a1..a6 aliases defined later in this file).
 * Outputs: each outN is read, XORed with one computed result vector, and
 *          written back; every use in this file passes B(n) slots.
 * Scratch: all of %mm0..%mm7 are overwritten; tmp_at(1)..tmp_at(6) are used
 *          as spill slots.  pnot is read only and must already contain all
 *          ones.
 *
 * The instruction order is the result of the scheduling pass described in
 * the header comment; do not reorder lines without re-measuring.
 */
#define S8(out1, out2, out3, out4) \
	movq %mm2,%mm7; \
	movq %mm1,tmp_at(1); \
	pandn %mm2,%mm1; \
	movq %mm2,tmp_at(2); \
	pandn %mm4,%mm2; \
	movq %mm5,tmp_at(5); \
	pxor %mm3,%mm2; \
	movq %mm4,tmp_at(4); \
	movq %mm1,%mm5; \
	movq %mm3,tmp_at(3); \
	movq %mm2,%mm4; \
	movq %mm2,%mm3; \
	pandn tmp_at(1),%mm4; \
	pand %mm0,%mm2; \
	pandn tmp_at(1),%mm7; \
	pandn %mm2,%mm1; \
	pxor tmp_at(4),%mm7; \
	movq %mm4,%mm6; \
	por %mm0,%mm4; \
	movq %mm7,tmp_at(6); \
	pand %mm4,%mm7; \
	pxor pnot,%mm3; \
	por %mm7,%mm2; \
	pxor %mm7,%mm3; \
	pandn tmp_at(2),%mm4; \
	movq tmp_at(5),%mm7; \
	pxor %mm4,%mm3; \
	por %mm1,%mm7; \
	pxor %mm3,%mm5; \
	pxor %mm5,%mm7; \
	pxor %mm0,%mm5; \
	pxor out2,%mm7; \
	movq %mm7,out2; \
	pxor tmp_at(1),%mm3; \
	movq %mm5,%mm4; \
	pand tmp_at(4),%mm5; \
	pxor %mm3,%mm5; \
	por tmp_at(3),%mm3; \
	pxor %mm5,%mm6; \
	pxor tmp_at(6),%mm3; \
	pxor %mm2,%mm5; \
	pxor %mm6,%mm3; \
	por tmp_at(1),%mm5; \
	pxor %mm3,%mm0; \
	pxor %mm4,%mm5; \
	por tmp_at(3),%mm4; \
	pxor tmp_at(4),%mm5; \
	pand tmp_at(5),%mm2; \
	pandn %mm5,%mm4; \
	pand tmp_at(5),%mm0; \
	pxor %mm6,%mm0; \
	por %mm1,%mm4; \
	pxor out4,%mm0; \
	pxor %mm4,%mm3; \
	pxor out3,%mm2; \
	por tmp_at(5),%mm3; \
	pxor out1,%mm3; \
	pxor %mm5,%mm2; \
	pxor %mm6,%mm3; \
	movq %mm0,out4; \
	movq %mm2,out3; \
	movq %mm3,out1

/*
 * Register expected to hold an all-zero vector.  The callers in this file
 * establish that with "pxor zero,zero" before using the macros below.
 */
#define zero				%mm0

/* Store the zero register into the eight consecutive vectors B(i)..B(i+7). */
#define DES_bs_clear_block_8(i) \
	movq zero,B(i); \
	movq zero,B(i + 1); \
	movq zero,B(i + 2); \
	movq zero,B(i + 3); \
	movq zero,B(i + 4); \
	movq zero,B(i + 5); \
	movq zero,B(i + 6); \
	movq zero,B(i + 7)

/* Store the zero register into all 64 vectors B(0)..B(63). */
#define DES_bs_clear_block \
	DES_bs_clear_block_8(0); \
	DES_bs_clear_block_8(8); \
	DES_bs_clear_block_8(16); \
	DES_bs_clear_block_8(24); \
	DES_bs_clear_block_8(32); \
	DES_bs_clear_block_8(40); \
	DES_bs_clear_block_8(48); \
	DES_bs_clear_block_8(56)

/*
 * Key schedule cursor and the two ways of indexing from it:
 *   K(i) - the i-th 8-byte vector relative to k_ptr
 *   k(i) - the i-th pointer-sized slot relative to k_ptr
 */
#define k_ptr				%edx
#define K(i)				nvec(i)(k_ptr)
#define k(i)				nptr(i)(k_ptr)

/* The six S-box input vectors; the S-box macros read exactly these. */
#define a1				%mm0
#define a2				%mm1
#define a3				%mm2
#define a4				%mm3
#define a5				%mm4
#define a6				%mm5

/*
 * Integer scratch registers used to hold addresses loaded from E() or k().
 * Every function in this file that uses these macros pushes %esi on entry
 * and pops it before returning; %ecx is not saved.
 */
#define tmp1				%ecx
#define tmp2				%esi

/*
 * xor_E(i): prepare the six S-box inputs from key vectors and E pointers.
 *
 * For n = 0..5: a(n+1) = K(i+n) XOR the vector stored at the address held
 * in slot E(i+n).
 *
 * Clobbers tmp1 and tmp2.  Address loads, key loads and XORs are
 * interleaved in pairs; keep that order when editing.
 */
#define xor_E(i) \
	movl E(i),tmp1; \
	movq K(i),a1; \
	movl E(i + 1),tmp2; \
	movq K(i + 1),a2; \
	pxor (tmp1),a1; \
	pxor (tmp2),a2; \
	movl E(i + 2),tmp1; \
	movq K(i + 2),a3; \
	movl E(i + 3),tmp2; \
	movq K(i + 3),a4; \
	pxor (tmp1),a3; \
	pxor (tmp2),a4; \
	movl E(i + 4),tmp1; \
	movq K(i + 4),a5; \
	movl E(i + 5),tmp2; \
	movq K(i + 5),a6; \
	pxor (tmp1),a5; \
	pxor (tmp2),a6

/*
 * xor_B(b1, k1, ..., b6, k6): prepare the six S-box inputs from fixed data
 * block positions instead of going through the E pointers.
 *
 * For n = 1..6: a(n) = B(b(n)) XOR K(k(n)).
 *
 * Uses no integer scratch registers.
 */
#define xor_B(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, b6, k6) \
	movq B(b1),a1; \
	movq B(b2),a2; \
	pxor K(k1),a1; \
	movq B(b3),a3; \
	pxor K(k2),a2; \
	movq B(b4),a4; \
	pxor K(k3),a3; \
	movq B(b5),a5; \
	pxor K(k4),a4; \
	movq B(b6),a6; \
	pxor K(k5),a5; \
	pxor K(k6),a6

/*
 * xor_B_KS_p(b1, k1, ..., b6, k6): like xor_B, but the key schedule is read
 * as pointers rather than as vectors.
 *
 * For n = 1..6: a(n) = B(b(n)) XOR the vector stored at the address held in
 * pointer slot k(k(n)) relative to k_ptr.
 *
 * Clobbers tmp1 and tmp2.
 */
#define xor_B_KS_p(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, b6, k6) \
	movl k(k1),tmp1; \
	movl k(k2),tmp2; \
	movq B(b1),a1; \
	movq B(b2),a2; \
	pxor (tmp1),a1; \
	movl k(k3),tmp1; \
	pxor (tmp2),a2; \
	movl k(k4),tmp2; \
	movq B(b3),a3; \
	movq B(b4),a4; \
	pxor (tmp1),a3; \
	movl k(k5),tmp1; \
	pxor (tmp2),a4; \
	movq B(b5),a5; \
	movl k(k6),tmp2; \
	movq B(b6),a6; \
	pxor (tmp1),a5; \
	pxor (tmp2),a6

.text

/*
 * DES_bs_init_asm: one-time setup for the S-box macros.
 *
 * Takes no arguments and reads nothing from the stack.  Writes an all-ones
 * 64-bit value to pnot (scratch slot 0 of DES_bs_all), which the S-box
 * macros XOR in to negate a value.
 *
 * Clobbers %mm0.  Like the other MMX entry points in this file, it executes
 * emms before returning when EMMS is defined, so the caller does not
 * inherit MMX state in the x87 register file.
 */
DO_ALIGN(5)
.globl DES_bs_init_asm
DES_bs_init_asm:
	/* Comparing a register with itself sets every bit of it. */
	pcmpeqd %mm0,%mm0
	movq %mm0,pnot
#ifdef EMMS
	emms
#endif
	ret

/*
 * Loop-control registers shared by DES_bs_crypt and DES_bs_crypt_25.
 *
 * rounds_and_swapped is a down-counter of passes through the loop body.
 * It starts at 8 when a pass sequence enters at the *_start label and at
 * 0x108 when it enters at the *_swap label; reaching 0 ends the first kind
 * of sequence and reaching 0x100 ends the second kind.
 */
#define rounds_and_swapped		%ebp
#define iterations			%eax

/*
 * DES_bs_crypt: run the bitslice DES loop a caller-supplied number of times.
 *
 * In:    4(%esp) = iteration count (read before anything is pushed).
 * State: clears B(0)..B(63), then updates B() in place; reads the key
 *        schedule as vectors starting at DES_bs_all_KS_v and the pointers
 *        in E().
 * Saves/restores %ebp and %esi; overwrites %eax, %ecx, %edx and all MMX
 * registers.  Executes emms before returning when EMMS is defined.
 *
 * Each pass through the loop body is two groups of eight S-box
 * evaluations: the first group (from DES_bs_crypt_start) writes
 * B(32)..B(63), the second (from DES_bs_crypt_swap) writes B(0)..B(31),
 * and k_ptr advances by 96 vectors per pass.  Successive iterations
 * alternate between entering at DES_bs_crypt_start and at
 * DES_bs_crypt_swap; presumably this stands in for swapping the two block
 * halves between DES iterations - confirm against the C implementation.
 *
 * NOTE(review): the count is only tested after being decremented, so a
 * count of 0 would wrap around rather than return immediately.
 */
DO_ALIGN(5)
.globl DES_bs_crypt
DES_bs_crypt:
	movl 4(%esp),iterations
	pxor zero,zero
	pushl %ebp
	pushl %esi
	movl $DES_bs_all_KS_v,k_ptr
	DES_bs_clear_block
	movl $8,rounds_and_swapped
DES_bs_crypt_start:
	xor_E(0)
	S1(B(40), B(48), B(54), B(62))
	xor_E(6)
	S2(B(44), B(59), B(33), B(49))
	xor_E(12)
	S3(B(55), B(47), B(61), B(37))
	xor_E(18)
	S4(B(57), B(51), B(41), B(32))
	xor_E(24)
	S5(B(39), B(45), B(56), B(34))
	xor_E(30)
	S6(B(35), B(60), B(42), B(50))
	xor_E(36)
	S7(B(63), B(43), B(53), B(38))
	xor_E(42)
	S8(B(36), B(58), B(46), B(52))
	/* 0x100 marks the end of a sequence that was entered at the swap label. */
	cmpl $0x100,rounds_and_swapped
	je DES_bs_crypt_next
DES_bs_crypt_swap:
	xor_E(48)
	S1(B(8), B(16), B(22), B(30))
	xor_E(54)
	S2(B(12), B(27), B(1), B(17))
	xor_E(60)
	S3(B(23), B(15), B(29), B(5))
	xor_E(66)
	S4(B(25), B(19), B(9), B(0))
	xor_E(72)
	S5(B(7), B(13), B(24), B(2))
	xor_E(78)
	S6(B(3), B(28), B(10), B(18))
	xor_E(84)
	S7(B(31), B(11), B(21), B(6))
	xor_E(90)
	/* Safe to advance here: S8 does not reference k_ptr. */
	addl $nvec(96),k_ptr
	S8(B(4), B(26), B(14), B(20))
	decl rounds_and_swapped
	jnz DES_bs_crypt_start
	/*
	 * Eight passes done.  Rewind k_ptr to 48 vectors before the start of
	 * the key schedule so that the K(48..95) reads of the second group
	 * land on its first 48 vectors, then re-enter at the swap label.
	 */
	subl $nvec(0x300+48),k_ptr
	movl $0x108,rounds_and_swapped
	decl iterations
	jnz DES_bs_crypt_swap
	popl %esi
	popl %ebp
#ifdef EMMS
	emms
#endif
	ret
DES_bs_crypt_next:
	/* End of a swap-entered sequence: put k_ptr back at DES_bs_all_KS_v. */
	subl $nvec(0x300-48),k_ptr
	movl $8,rounds_and_swapped
	decl iterations
	jnz DES_bs_crypt_start
	popl %esi
	popl %ebp
#ifdef EMMS
	emms
#endif
	ret

/*
 * DES_bs_crypt_25: same loop structure as DES_bs_crypt, but with the
 * iteration count fixed at 25 and no stack argument.
 *
 * State: clears B(0)..B(63), then updates B() in place; reads the key
 *        schedule as vectors starting at DES_bs_all_KS_v.
 * Saves/restores %ebp and %esi; overwrites %eax, %ecx, %edx and all MMX
 * registers.  Executes emms before returning when EMMS is defined.
 *
 * The inputs of S3, S4, S7 and S8 are fetched from fixed B() positions with
 * xor_B instead of through the E() pointers; only S1, S2, S5 and S6 use
 * xor_E.  NOTE(review): presumably the E() entries feeding those four
 * S-boxes never vary for the callers of this routine - confirm.
 *
 * The iteration counter is decremented alternately on the fall-through path
 * (leaving even values 24, 22, ..., 0) and at DES_bs_crypt_25_next (leaving
 * odd values 23, 21, ..., 1), so it can only reach zero on the fall-through
 * path; that is why DES_bs_crypt_25_next ends in an unconditional jmp.
 */
DO_ALIGN(5)
.globl DES_bs_crypt_25
DES_bs_crypt_25:
	pxor zero,zero
	pushl %ebp
	pushl %esi
	movl $DES_bs_all_KS_v,k_ptr
	DES_bs_clear_block
	movl $8,rounds_and_swapped
	movl $25,iterations
DES_bs_crypt_25_start:
	xor_E(0)
	S1(B(40), B(48), B(54), B(62))
	xor_E(6)
	S2(B(44), B(59), B(33), B(49))
	xor_B(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_E(24)
	S5(B(39), B(45), B(56), B(34))
	xor_E(30)
	S6(B(35), B(60), B(42), B(50))
	xor_B(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
	/* 0x100 marks the end of a sequence that was entered at the swap label. */
	cmpl $0x100,rounds_and_swapped
	je DES_bs_crypt_25_next
DES_bs_crypt_25_swap:
	xor_E(48)
	S1(B(8), B(16), B(22), B(30))
	xor_E(54)
	S2(B(12), B(27), B(1), B(17))
	xor_B(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_E(72)
	S5(B(7), B(13), B(24), B(2))
	xor_E(78)
	S6(B(3), B(28), B(10), B(18))
	xor_B(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	S8(B(4), B(26), B(14), B(20))
	addl $nvec(96),k_ptr
	decl rounds_and_swapped
	jnz DES_bs_crypt_25_start
	/*
	 * Eight passes done.  Rewind k_ptr to 48 vectors before the start of
	 * the key schedule so that the K(48..95) reads of the second group
	 * land on its first 48 vectors, then re-enter at the swap label.
	 */
	subl $nvec(0x300+48),k_ptr
	movl $0x108,rounds_and_swapped
	decl iterations
	jnz DES_bs_crypt_25_swap
	popl %esi
	popl %ebp
#ifdef EMMS
	emms
#endif
	ret
DES_bs_crypt_25_next:
	/* End of a swap-entered sequence: put k_ptr back at DES_bs_all_KS_v. */
	subl $nvec(0x300-48),k_ptr
	movl $8,rounds_and_swapped
	decl iterations
	/* The counter is odd here and therefore non-zero; see the header. */
	jmp DES_bs_crypt_25_start

/*
 * All-ones vector used only while DES_bs_crypt_LM initializes B().
 * %mm1 is also a2, so its value does not survive the first xor_B_KS_p.
 */
#define ones				%mm1

/* Pass counter for DES_bs_crypt_LM; the same register as "iterations". */
#define rounds				%eax

/*
 * DES_bs_crypt_LM: one run of eight loop passes over a constant initial
 * data block, with the key schedule read through pointers.
 *
 * Takes no stack arguments.
 * State: sets every B(n) to all zeros or all ones in a fixed pattern, then
 *        updates B() in place; reads pointer slots starting at
 *        DES_bs_all_KS_p and dereferences them.
 *        NOTE(review): the zero/ones pattern is presumably the bitsliced
 *        form of the fixed LM plaintext - confirm against the C version.
 * Saves/restores %esi; overwrites %eax, %ecx, %edx and all MMX registers.
 * Executes emms before returning when EMMS is defined.
 *
 * Each pass is two groups of eight S-box evaluations (the first writes
 * B(32)..B(63), the second B(0)..B(31)) and advances k_ptr by 96 pointer
 * slots.
 */
DO_ALIGN(5)
.globl DES_bs_crypt_LM
DES_bs_crypt_LM:
	pxor zero,zero
	pushl %esi
	pcmpeqd ones,ones
	movl $DES_bs_all_KS_p,k_ptr
	movq zero,B(0)
	movq zero,B(1)
	movq zero,B(2)
	movq zero,B(3)
	movq zero,B(4)
	movq zero,B(5)
	movq zero,B(6)
	movq zero,B(7)
	movq ones,B(8)
	movq ones,B(9)
	movq ones,B(10)
	movq zero,B(11)
	movq ones,B(12)
	movq zero,B(13)
	movq zero,B(14)
	movq zero,B(15)
	movq zero,B(16)
	movq zero,B(17)
	movq zero,B(18)
	movq zero,B(19)
	movq zero,B(20)
	movq zero,B(21)
	movq zero,B(22)
	movq ones,B(23)
	movq zero,B(24)
	movq zero,B(25)
	movq ones,B(26)
	movq zero,B(27)
	movq zero,B(28)
	movq ones,B(29)
	movq ones,B(30)
	movq ones,B(31)
	movq zero,B(32)
	movq zero,B(33)
	movq zero,B(34)
	movq ones,B(35)
	movq zero,B(36)
	movq ones,B(37)
	movq ones,B(38)
	movq ones,B(39)
	movq zero,B(40)
	movq zero,B(41)
	movq zero,B(42)
	movq zero,B(43)
	movq zero,B(44)
	movq ones,B(45)
	movq zero,B(46)
	movq zero,B(47)
	movq ones,B(48)
	movq ones,B(49)
	movq zero,B(50)
	movq zero,B(51)
	movq zero,B(52)
	movq zero,B(53)
	movq ones,B(54)
	movq zero,B(55)
	movq ones,B(56)
	movq zero,B(57)
	movq ones,B(58)
	movq zero,B(59)
	movq ones,B(60)
	movq ones,B(61)
	movq ones,B(62)
	movq ones,B(63)
	movl $8,rounds
DES_bs_crypt_LM_loop:
	xor_B_KS_p(31, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5)
	S1(B(40), B(48), B(54), B(62))
	xor_B_KS_p(3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11)
	S2(B(44), B(59), B(33), B(49))
	xor_B_KS_p(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B_KS_p(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_B_KS_p(15, 24, 16, 25, 17, 26, 18, 27, 19, 28, 20, 29)
	S5(B(39), B(45), B(56), B(34))
	xor_B_KS_p(19, 30, 20, 31, 21, 32, 22, 33, 23, 34, 24, 35)
	S6(B(35), B(60), B(42), B(50))
	xor_B_KS_p(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B_KS_p(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
	xor_B_KS_p(63, 48, 32, 49, 33, 50, 34, 51, 35, 52, 36, 53)
	S1(B(8), B(16), B(22), B(30))
	xor_B_KS_p(35, 54, 36, 55, 37, 56, 38, 57, 39, 58, 40, 59)
	S2(B(12), B(27), B(1), B(17))
	xor_B_KS_p(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B_KS_p(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_B_KS_p(47, 72, 48, 73, 49, 74, 50, 75, 51, 76, 52, 77)
	S5(B(7), B(13), B(24), B(2))
	xor_B_KS_p(51, 78, 52, 79, 53, 80, 54, 81, 55, 82, 56, 83)
	S6(B(3), B(28), B(10), B(18))
	xor_B_KS_p(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B_KS_p(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	/* Safe to advance here: S8 does not reference k_ptr. */
	addl $nptr(96),k_ptr
	S8(B(4), B(26), B(14), B(20))
	decl rounds
	jnz DES_bs_crypt_LM_loop
	popl %esi
#ifdef EMMS
	emms
#endif
	ret

#endif

/*
 * On Linux/ELF, emit an empty .note.GNU-stack section so that this object
 * does not cause the linked binary to request an executable stack.  This is
 * deliberately outside the DES_BS_ASM conditional, so it applies even when
 * the rest of the file is compiled out.
 */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",@progbits
#endif
