/***************************************************************************
*
*                          AA      RRRRRRRR    CCCCCCC
*                         AAAA     RRRRRRRRR  CCCCCCCCC
*                        AAAAAA    RRR    RR  CCC
*                       AAA  AAA   RRRRRRRRR  CC
*                      AAA    AAA  RRR RRR    CCC
*                     AAA      AAA RRR  RRRR  CCCCCCCCC
*                    AAA        AAARRR   RRRR  CCCCCCC
*
*
*                Copyright 2009 ARC International Ltd
*                          San Jose, CA 95134
*                          All Rights Reserved
*
*  This program is the unpublished property and trade secret of ARC. It
*  is to be  utilized  solely  under  license  from  ARC  and it is to be
*  maintained on a confidential basis for internal company use only.  The
*  security  and  protection  of  the program is paramount to maintenance
*  of the trade secret status.  It is to  be  protected  from  disclosure
*  to unauthorized parties, both within the Licensee company and outside,
*  in a manner  not less stringent than  that utilized for Licensee's own
*  proprietary internal information.  No  copies of  the source or Object
*  Code are to leave the premises of Licensee's business except in strict
*  accordance with the license agreement signed by Licensee with ARC.
*
***************************************************************************/
/*
      Instruction Intrinsics

      Filename:   arc_intrinsics.h
      Version:    1.0
      History:    
        Oct-14-2009: Added the A_MX*, A_MY*, A_AX* and A_AY* register definitions

      Description:
        This file contains the definitions of various intrinsics that can be
        utilized in a C application, giving access to special instructions
        such as DSP instructions or Extended Accumulator instructions.
*/


#ifndef _ARC_INTRINSICS_H
#define _ARC_INTRINSICS_H 1

#ifdef __cplusplus
extern "C" {
#endif

#include "arc_reg.h"

/*
typedef unsigned int uint32 ;
typedef int int32;
typedef char uint8 ;
typedef unsigned short uint16 ;
*/



/*
 * AUX Register Symbols
 * (redefinition of symbols found in arc/arc_reg.h)
 */

/* Auxiliary-register numbers of the two halves of the 32x16 accumulator. */
#define AUX_XMACLW_H    0x9F /* r/w 32 32x16 accumulator (high) */
#define AUX_XMACLW_L    0xA0 /* r/w 32 32x16 accumulator (low) */

/*
 * Register both numbers with the compiler under the names "aux_xmaclw_h"
 * and "aux_xmaclw_l".  These are the names used by the effects strings of
 * the __Xmul32x16 intrinsics declared in this header.  The pragmas also
 * list %r56 (high) and %r57 (low) as written and pass side_effects "w".
 */
#pragma aux_register(AUX_XMACLW_H, name => "aux_xmaclw_h", effects => "%r56:is_written", side_effects => "w")
#pragma aux_register(AUX_XMACLW_L, name => "aux_xmaclw_l", effects => "%r57:is_written", side_effects => "w")


/*
 * X-Y Memory Symbols and Definitions
 * (some are redefinition of symbols found in arc/arc_reg.h)
 */

/*
 * Everything in this section is only available when __Xxy is defined and
 * _ARCVER is below 0x40.  The macros are short aliases for REG_* symbols,
 * which are presumably supplied by xyregs.h (included below) or arc_reg.h
 * -- TODO confirm which header owns each one.
 */
#if defined __Xxy && _ARCVER<0x40

#include "xyregs.h"

/* X pointer registers: suffix _u0 / _u1 = with update 0 / 1, _nu = no update. */
#define X0_u0   REG_X0_U0  // X0 with update 0
#define X0_u1   REG_X0_U1  // X0 with update 1
#define X0_nu   REG_X0_NU  // X0 without update
#define X1_u0   REG_X1_U0  // X1 with update 0
#define X1_u1   REG_X1_U1  // X1 with update 1
#define X1_nu   REG_X1_NU  // X1 without update
#define X2_u0   REG_X2_U0  // X2 with update 0
#define X2_u1   REG_X2_U1  // X2 with update 1
#define X2_nu   REG_X2_NU  // X2 without update
#define X3_u0   REG_X3_U0  // X3 with update 0
#define X3_u1   REG_X3_U1  // X3 with update 1
#define X3_nu   REG_X3_NU  // X3 without update

/* Y pointer registers, same suffix convention as the X registers above. */
#define Y0_u0   REG_Y0_U0  // Y0 with update 0
#define Y0_u1   REG_Y0_U1  // Y0 with update 1
#define Y0_nu   REG_Y0_NU  // Y0 without update
#define Y1_u0   REG_Y1_U0  // Y1 with update 0
#define Y1_u1   REG_Y1_U1  // Y1 with update 1
#define Y1_nu   REG_Y1_NU  // Y1 without update
#define Y2_u0   REG_Y2_U0  // Y2 with update 0
#define Y2_u1   REG_Y2_U1  // Y2 with update 1
#define Y2_nu   REG_Y2_NU  // Y2 without update
#define Y3_u0   REG_Y3_U0  // Y3 with update 0
#define Y3_u1   REG_Y3_U1  // Y3 with update 1
#define Y3_nu   REG_Y3_NU  // Y3 without update

/* A_AXn / A_AYn register aliases (one per X/Y pointer 0..3). */
#define A_AX0  REG_AX0
#define A_AX1  REG_AX1
#define A_AX2  REG_AX2
#define A_AX3  REG_AX3
#define A_AY0  REG_AY0
#define A_AY1  REG_AY1
#define A_AY2  REG_AY2
#define A_AY3  REG_AY3

/* A_MXnm / A_MYnm register aliases: two per pointer n (m = 0 or 1),
   presumably one modifier per update mode u0/u1 -- TODO confirm. */
#define A_MX00 REG_MX00
#define A_MX01 REG_MX01
#define A_MX10 REG_MX10
#define A_MX11 REG_MX11
#define A_MX20 REG_MX20
#define A_MX21 REG_MX21
#define A_MX30 REG_MX30
#define A_MX31 REG_MX31
#define A_MY00 REG_MY00
#define A_MY01 REG_MY01
#define A_MY10 REG_MY10
#define A_MY11 REG_MY11
#define A_MY20 REG_MY20
#define A_MY21 REG_MY21
#define A_MY30 REG_MY30
#define A_MY31 REG_MY31

/* other burst related constants */
/* BURST_YMEM equals (1 << BSZ_XY_POS), i.e. bit 29 set. */
#define BURST_IN_PROGRESS  REG_XYCONFIG_BP_BIT
#define BURST_XMEM         0x0
#define BURST_YMEM         0x20000000
#define BSZ_BANK_POS       24
#define BSZ_XY_POS         29   /* not used */

/* Address Modifier Register Constants */
/* bit [31:30] AM addressing mode */
#define AM_MODULO          0x00000000
        /* Modulo addressing is selected when this constant is used.
           In this mode, bits 16 to 28 are used to define the modulo range.
           The address offset is specified using bits 0 to 13.
           This mode is also used for linear addressing, which is
           enabled by setting the modulo field to zero. */
#define AM_BITREVERSE      0x40000000
        /* Reverse-carry addressing is selected when this constant is used
           The address offset should be unsigned when this addressing mode
           is selected */
#define AM_USERDEFINED     0x80000000
        /* This mode allows the user to apply a special addressing mode to
           the address register value. This mode is provided by user
           implemented extension hardware for address calculation.
           Refer to Application Note #24 for more information. */
#define AM_RESERVED        0xC0000000
        /* This mode is reserved for future expansion. */

/* bit [29] H */
#define AM_16BIT           0x20000000
        /* half word (16-bit addressing mode)
           Bit 29 selects half word, a process where the address
           register value is shifted one place to the right
           (removing the LSB) before the address value is used
           to access memory to generate 16-bit data addressing.
           The LSB that is shifted out is used to determine which
           half of the 32-bit word should be processed. For a read
           operation, the 16-bit data read is always placed into
           the top half of the provided 32-bit word. The lower 16
           bits are cleared to zeros. In the case of a write operation,
           only the top 16 bits of the 32-bit word are written to
           memory. */
#define AM_MOD_POS         16
        /* bit position of MODULO part of Address Modifier Register */

/* bit [15] D */
/* NOTE(review): the names AU_POST / AU_PRE do not obviously match the
   "dual addressing mode" description below; confirm which is intended. */
#define AU_POST            0x0
#define AU_PRE             0x8000
        /* D - Dual addressing mode
           The dual mode is used with the 16-bit addressing mode.
           When enabled, the same 16-bit data is returned in both the lower
           and upper half of the 32-bit data word. This mode minimizes
           data memory use when the same data must be applied to both
           channels. */

/* Legacy aliases.  AM_ADDR_SHIFT has the same value as AM_16BIT (bit 29). */
#define AM_LINEAR          AM_MODULO   /* not used */
#define AM_ADDR_SHIFT      0x20000000  /* not used */

#endif /* __Xxy && _ARCVER < 0x40 */


/*
 * 24x24 XMAC support, available when __Xxmac_24 is defined.
 *
 * NOTE(review): TRUNC_CONST, CRandQandMSP and QandMSP are defined again,
 * with different values, in the __Xxmac_d16 section of this header.
 * Presumably the two options are never enabled together -- confirm.
 */
#ifdef __Xxmac_24

#define  TRUNC_CONST     0xFFFFFF00
/*
 * AUX_MACMODE modes
 */
#define CRandMSP     0x60000
#define CRandQandMSP 0x70000
#define QandMSP      0x30000
#define FClear       0x3

/*
   24-bit Multiply Accumulate Instructions
   ------------------------------------
   refer to
    "ARC(r) 600 DSPLib Reference"

opcode1		Instruction	Description
-------		-----------	---------------------
0x18		MULT		24x24-bit multiply without rounding
0x19		MULUT		Unsigned 24x24-bit multiply without rounding
0x1A		MULRT		24x24-bit multiply with rounding
				        (LSP write-back mod is not a useful operation)
0x1C		MACT		24x24-bit multiply-accumulate without rounding
0x1E		MACRT		24x24-bit multiply-accumulate with rounding
				        (LSP write-back mod is not a useful operation)
0x20		MSUBT		24x24-bit multiply-subtract without rounding

*/

/*
 * Each instruction is exposed twice as "int f(int, int)":
 *   _<insn>    bound to the instruction with latency_cycles => 2;
 *   _<insn>_f  the same binding plus set_flags => 1 and flags => "znv".
 */
extern int _mult(int, int);
#pragma intrinsic(_mult, name => "mult", latency_cycles => 2);
extern int _mult_f(int, int);
#pragma intrinsic(_mult_f, name => "mult", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unchanged */

extern int _mulut(int, int);
#pragma intrinsic(_mulut, name => "mulut", latency_cycles => 2);
extern int _mulut_f(int, int);
#pragma intrinsic(_mulut_f, name => "mulut", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unchanged */

extern int _mulrt(int, int);
#pragma intrinsic(_mulrt, name => "mulrt", latency_cycles => 2);
extern int _mulrt_f(int, int);
#pragma intrinsic(_mulrt_f, name => "mulrt", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unchanged */

extern int _mact(int, int);
#pragma intrinsic(_mact, name => "mact", latency_cycles => 2);
extern int _mact_f(int, int);
#pragma intrinsic(_mact_f, name => "mact", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unchanged */

extern int _macrt(int, int);
#pragma intrinsic(_macrt, name => "macrt", latency_cycles => 2);
extern int _macrt_f(int, int);
#pragma intrinsic(_macrt_f, name => "macrt", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unchanged */

extern int _msubt(int, int);
#pragma intrinsic(_msubt, name => "msubt", latency_cycles => 2);
extern int _msubt_f(int, int);
#pragma intrinsic(_msubt_f, name => "msubt", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unchanged */

#endif /* __Xxmac_24 */


/*
 * Dual 16x16 XMAC support, available when __Xxmac_d16 is defined.
 *
 * NOTE(review): TRUNC_CONST, CRandQandMSP and QandMSP are also defined,
 * with different values, in the __Xxmac_24 section of this header.
 * Presumably the two options are never enabled together -- confirm.
 */
#ifdef __Xxmac_d16

#define  TRUNC_CONST     0xFFFF0000
/*
 * AUX_MACMODE modes
 */
#define CRandQandMSP 0x800c
#define QandMSP      0x0c
#define FClear       0x3

/*
   Multiply Accumulate Instructions (Dual 16x16)
   ---------------------------------------------
   refer to
    "ARC(r) 600 DSP Options V4.0 Reference"

   (vbfdw is listed here for reference; its intrinsic is declared
    separately in this header under __Xdvbf / __Xvbfdw.)

opcode1		Instruction	Description
-------		-----------	---------------------
0x0A            vbfdw           Dual Viterbi Butterfly
0x0B            fbfdw           Dual-16-bit FFT butterfly accelerator
0x0C		MULDW		Dual 16x16-bit multiplication without rounding.
0x0D		MULUDW		Dual 16x16-bit multiplication unsigned
				        without rounding.
0x0E		MULRDW		Dual 16x16-bit multiplication with rounding.
0x10		MACDW		Dual 16x16-bit multiply-accumulate without
				        rounding.
0x11		MACUDW		Dual 16x16-bit multiply-accumulate unsigned
				        without rounding.
0x12		MACRDW		Dual 16x16-bit multiply-accumulate with
				        rounding
0x14		MSUBDW		Dual 16x16-bit multiply-subtract without
				        rounding.
0x26		CMACRDW		Partial complex multiplication (swaps
				        products and subtracts from A2) with
				        rounding
*/

/*
 * Each instruction is exposed twice:
 *   _<insn>    bound to the instruction with latency_cycles => 2;
 *   _<insn>_f  the same binding plus set_flags => 1 and flags => "znv".
 * All take two int operands except _fbfdw / _fbfdw_f, which take one.
 */
extern int _muldw(int, int);
#pragma intrinsic(_muldw, name => "muldw", latency_cycles => 2);

extern int _muldw_f(int, int);
#pragma intrinsic(_muldw_f, name => "muldw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */

extern int _muludw(int, int);
#pragma intrinsic(_muludw, name => "muludw", latency_cycles => 2);

extern int _muludw_f(int, int);
#pragma intrinsic(_muludw_f, name => "muludw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */

extern int _mulrdw(int, int);
#pragma intrinsic(_mulrdw, name => "mulrdw", latency_cycles => 2);

extern int _mulrdw_f(int, int);
#pragma intrinsic(_mulrdw_f, name => "mulrdw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */

extern int _macdw(int, int);
#pragma intrinsic(_macdw, name => "macdw", latency_cycles => 2);

extern int _macdw_f(int, int);
#pragma intrinsic(_macdw_f, name => "macdw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */

extern int _macudw(int, int);
#pragma intrinsic(_macudw, name => "macudw", latency_cycles => 2);

extern int _macudw_f(int, int);
#pragma intrinsic(_macudw_f, name => "macudw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */

extern int _macrdw(int, int);
#pragma intrinsic(_macrdw, name => "macrdw", latency_cycles => 2);

extern int _macrdw_f(int, int);
#pragma intrinsic(_macrdw_f, name => "macrdw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */

extern int _msubdw(int, int);
#pragma intrinsic(_msubdw, name => "msubdw", latency_cycles => 2);

extern int _msubdw_f(int, int);
#pragma intrinsic(_msubdw_f, name => "msubdw", latency_cycles => 2, set_flags => 1, flags => "znv");

extern int _cmacrdw(int, int);
#pragma intrinsic(_cmacrdw, name => "cmacrdw", latency_cycles => 2);

extern int _cmacrdw_f(int, int);
#pragma intrinsic(_cmacrdw_f, name => "cmacrdw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */

extern int _fbfdw(int);
#pragma intrinsic(_fbfdw, name => "fbfdw", latency_cycles => 2);

extern int _fbfdw_f(int);
#pragma intrinsic(_fbfdw_f, name => "fbfdw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */

#endif /* __Xxmac_d16 */

/*
 * vbfdw intrinsics (single int operand), available when either __Xdvbf or
 * __Xvbfdw is defined.  The _f variant additionally passes set_flags => 1
 * with flags => "znv".
 */
#if defined __Xdvbf || defined __Xvbfdw
extern int _vbfdw(int);
#pragma intrinsic(_vbfdw, name => "vbfdw", latency_cycles => 2);

extern int _vbfdw_f(int);
#pragma intrinsic(_vbfdw_f, name => "vbfdw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */
#endif /* __Xdvbf || __Xvbfdw */

/* crc intrinsic (two int operands), available when __Xcrc is defined.
   No latency or flag options are passed to the pragma. */
#ifdef __Xcrc
extern int _crc(int,int);
#pragma intrinsic(_crc, name => "crc");
#endif /* __Xcrc */

#ifdef __Xdsp_packa
/*
    DSPLib Accelerator Instructions
    -------------------------------
    (crc is listed for reference; its intrinsic is declared separately
     in this header under __Xcrc.)

opcode1         Instruction     Description
-------         -----------     ---------------------
0x2C            crc             3-cycle variable polynomial CRC
0x21            asldw           Dual Word Arithmetic Shift Left
0x22            asrdw           Dual Word Arithmetic Shift Right
0x23            lsrdw           Dual Word Logical Shift Right
0x24            aslsdw          Dual Word Arithmetic Shift Left Saturating
0x25            asrsdw          Dual Word Arithmetic Shift Right Saturating
0x2B            maxabssdw       Dual Word Maximum Operation of Absolute Value
0x0f            maxidl          Dual Long Word MAX with Counter
0x09            minidl          Dual Long Word MIN with Counter

*/
/*
 * Each instruction is exposed as "int f(int, int)" in two forms:
 *   _<insn>    bound to the instruction with latency_cycles => 2;
 *   _<insn>_f  the same binding plus set_flags => 1 and a per-instruction
 *              flags list ("znc", "znv", "zv" or "zn").
 *
 * NOTE(review): the trailing comment on _asldw says "carry unaffected"
 * although _asldw_f lists flags "znc"; it was probably meant to read
 * "v unaffected" like _asrdw -- confirm.
 */
extern int _asldw(int, int);
#pragma intrinsic(_asldw, name => "asldw", latency_cycles => 2); /* carry unaffected */

extern int _asldw_f(int, int);
#pragma intrinsic(_asldw_f, name => "asldw", latency_cycles => 2, set_flags => 1, flags => "znc"); /* v unaffected */

extern int _asrdw(int, int);
#pragma intrinsic(_asrdw, name => "asrdw", latency_cycles => 2); /* v unaffected */

extern int _asrdw_f(int, int);
#pragma intrinsic(_asrdw_f, name => "asrdw", latency_cycles => 2, set_flags => 1, flags => "znc"); /* v unaffected */

extern int _aslsdw(int, int);
#pragma intrinsic(_aslsdw, name => "aslsdw", latency_cycles => 2); /* carry unaffected */

extern int _aslsdw_f(int, int);
#pragma intrinsic(_aslsdw_f, name => "aslsdw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */

extern int _lsrdw(int, int);
#pragma intrinsic(_lsrdw, name => "lsrdw", latency_cycles => 2); 

extern int _lsrdw_f(int, int);
#pragma intrinsic(_lsrdw_f, name => "lsrdw", latency_cycles => 2, set_flags => 1, flags => "znc"); /* v unaffected */

extern int _asrsdw(int, int);
#pragma intrinsic(_asrsdw, name => "asrsdw", latency_cycles => 2); /* carry unaffected */

extern int _asrsdw_f(int, int);
#pragma intrinsic(_asrsdw_f, name => "asrsdw", latency_cycles => 2, set_flags => 1, flags => "znv"); /* carry unaffected */

extern int _maxabssdw(int, int);
#pragma intrinsic(_maxabssdw, name => "maxabssdw", latency_cycles => 2); 

extern int _maxabssdw_f(int, int);
#pragma intrinsic(_maxabssdw_f, name => "maxabssdw", latency_cycles => 2, set_flags => 1, flags => "zv");

extern int _maxidl(int, int);
#pragma intrinsic(_maxidl, name => "maxidl", latency_cycles => 2); 

extern int _maxidl_f(int, int);
#pragma intrinsic(_maxidl_f, name => "maxidl", latency_cycles => 2, set_flags => 1, flags => "zn");

extern int _minidl(int, int);
#pragma intrinsic(_minidl, name => "minidl", latency_cycles => 2); 

extern int _minidl_f(int, int);
#pragma intrinsic(_minidl_f, name => "minidl", latency_cycles => 2, set_flags => 1, flags => "zn");

#endif /* __Xdsp_packa */


#ifdef __Xmul32x16
/*
   Multiply Accumulate Instructions (__Xmul32x16)
   ----------------------------------------------
   refer to
    "ARC(r) 600 DSP Options V4.0 Reference"

opcode1		Instruction	Description
-------		-----------	---------------------
0x30		MULULW		Low part 32x32 MULU
0x31		MULLW		Low part 32x32 MUL
0x32		MULFLW		Low part 32x32 MULF
0x33		MACLW		Low part 32x32 MAC
0x34		MACFLW		Low part 32x32 MACF
0x35		MACHULW		High part 32x32 MULU/MACU
0x36		MACHLW		High part 32x32 MUL/MAC
0x37		MACHFLW		High part 32x32 MULF/MACF
0x38		MULHLW		Returns high part of 48-bit result to register.
0x39		MULHFLW		Returns high part of 48-bit result to register
			        with Q shift.

*/

/*
 * Every intrinsic below passes latency_cycles => 2 and an effects string
 * naming the accumulator registers aux_xmaclw_h / aux_xmaclw_l and r56 /
 * r57.  Two effect patterns are used:
 *   - mul* intrinsics (mululw, mullw, mulflw, mulhlw, mulhflw):
 *     the aux registers are marked is_written only;
 *   - mac* intrinsics (maclw, macflw, machulw, machlw, machflw):
 *     the aux registers are marked is_written:is_read.
 * The _f variants additionally pass set_flags => 1 with flags => "znv".
 * _mulflw and _maclw (and their _f forms) declare the second operand as
 * unsigned int; all other intrinsics take (int, int).
 */
extern int _mululw(int, int);
#pragma intrinsic(_mululw, name => "mululw", latency_cycles => 2, effects => "aux_xmaclw_h:is_written;aux_xmaclw_l:is_written;r56:is_written;r57:is_written");

extern int _mululw_f(int, int);
#pragma intrinsic(_mululw_f, name => "mululw", latency_cycles => 2, set_flags => 1, flags => "znv", effects => "aux_xmaclw_h:is_written;aux_xmaclw_l:is_written;r56:is_written;r57:is_written");

extern int _mullw(int, int);
#pragma intrinsic(_mullw, name => "mullw", latency_cycles => 2, effects => "aux_xmaclw_h:is_written;aux_xmaclw_l:is_written;r56:is_written;r57:is_written");

extern int _mullw_f(int, int);
#pragma intrinsic(_mullw_f, name => "mullw", latency_cycles => 2, set_flags => 1, flags => "znv", effects => "aux_xmaclw_h:is_written;aux_xmaclw_l:is_written;r56:is_written;r57:is_written");

extern int _mulflw(int src1, unsigned int src2);
#pragma intrinsic(_mulflw, name => "mulflw", latency_cycles => 2, effects => "aux_xmaclw_h:is_written;aux_xmaclw_l:is_written;r56:is_written;r57:is_written");

extern int _mulflw_f(int src1, unsigned int src2);
#pragma intrinsic(_mulflw_f, name => "mulflw", latency_cycles => 2, set_flags => 1, flags => "znv", effects => "aux_xmaclw_h:is_written;aux_xmaclw_l:is_written;r56:is_written;r57:is_written");

extern int _maclw(int src1, unsigned int src2);
#pragma intrinsic(_maclw, name => "maclw", latency_cycles => 2, effects => "aux_xmaclw_h:is_written:is_read;aux_xmaclw_l:is_written:is_read;r56:is_written;r57:is_written");

extern int _maclw_f(int src1, unsigned int src2);
#pragma intrinsic(_maclw_f, name => "maclw", latency_cycles => 2, set_flags => 1, flags => "znv", effects => "aux_xmaclw_h:is_written:is_read;aux_xmaclw_l:is_written:is_read;r56:is_written;r57:is_written");

extern int _macflw(int, int);
#pragma intrinsic(_macflw, name => "macflw", latency_cycles => 2, effects => "aux_xmaclw_h:is_written:is_read;aux_xmaclw_l:is_written:is_read;r56:is_written;r57:is_written");

extern int _macflw_f(int, int);
#pragma intrinsic(_macflw_f, name => "macflw", latency_cycles => 2, set_flags => 1, flags => "znv", effects => "aux_xmaclw_h:is_written:is_read;aux_xmaclw_l:is_written:is_read;r56:is_written;r57:is_written");

extern int _machulw(int, int);
#pragma intrinsic(_machulw, name => "machulw", latency_cycles => 2, effects => "aux_xmaclw_h:is_written:is_read;aux_xmaclw_l:is_written:is_read;r56:is_written;r57:is_written");

extern int _machulw_f(int, int);
#pragma intrinsic(_machulw_f, name => "machulw", latency_cycles => 2, set_flags => 1, flags => "znv", effects => "aux_xmaclw_h:is_written:is_read;aux_xmaclw_l:is_written:is_read;r56:is_written;r57:is_written");

extern int _machlw(int, int);
#pragma intrinsic(_machlw, name => "machlw", latency_cycles => 2, effects => "aux_xmaclw_h:is_written:is_read;aux_xmaclw_l:is_written:is_read;r56:is_written;r57:is_written");

extern int _machlw_f(int, int);
#pragma intrinsic(_machlw_f, name => "machlw", latency_cycles => 2, set_flags => 1, flags => "znv", effects => "aux_xmaclw_h:is_written:is_read;aux_xmaclw_l:is_written:is_read;r56:is_written;r57:is_written");

extern int _machflw(int, int);
#pragma intrinsic(_machflw, name => "machflw", latency_cycles => 2, effects => "aux_xmaclw_h:is_written:is_read;aux_xmaclw_l:is_written:is_read;r56:is_written;r57:is_written");

extern int _machflw_f(int, int);
#pragma intrinsic(_machflw_f, name => "machflw", latency_cycles => 2, set_flags => 1, flags => "znv", effects => "aux_xmaclw_h:is_written:is_read;aux_xmaclw_l:is_written:is_read;r56:is_written;r57:is_written");

extern int _mulhlw(int, int);
#pragma intrinsic(_mulhlw, name => "mulhlw", latency_cycles => 2, effects => "aux_xmaclw_h:is_written;aux_xmaclw_l:is_written;r56:is_written;r57:is_written");

extern int _mulhlw_f(int, int);
#pragma intrinsic(_mulhlw_f, name => "mulhlw", latency_cycles => 2, set_flags => 1, flags => "znv", effects => "aux_xmaclw_h:is_written;aux_xmaclw_l:is_written;r56:is_written;r57:is_written");

extern int _mulhflw(int, int);
#pragma intrinsic(_mulhflw, name => "mulhflw", latency_cycles => 2, effects => "aux_xmaclw_h:is_written;aux_xmaclw_l:is_written;r56:is_written;r57:is_written");

extern int _mulhflw_f(int, int);
#pragma intrinsic(_mulhflw_f, name => "mulhflw", latency_cycles => 2, set_flags => 1, flags => "znv", effects => "aux_xmaclw_h:is_written;aux_xmaclw_l:is_written;r56:is_written;r57:is_written");

#endif /* __Xmul32x16 */

/*
 * mpy / mpyu / mpym / mpymu intrinsics, available when __Xmpy is defined.
 * The signed forms take and return int, the "u" forms unsigned.
 * Each pragma passes flags => "znv" but no set_flags option; no _f
 * variants are declared in this section.
 */
#if defined __Xmpy
extern int _mpy(int, int);
extern unsigned _mpyu(unsigned, unsigned);
extern int _mpym(int, int);
extern unsigned _mpymu(unsigned, unsigned);
#pragma intrinsic(_mpy, name => "mpy", flags => "znv");
#pragma intrinsic(_mpyu, name => "mpyu", flags => "znv");
#pragma intrinsic(_mpym, name => "mpym", flags => "znv");
#pragma intrinsic(_mpymu, name => "mpymu", flags => "znv");
#endif /* __Xmpy */

/*
-----------------------------------------------
    __Xmpy16 - 16x16 integer multiply with
    32-bit result in any GPR.
-----------------------------------------------
*/
/*
 * mpyw (signed) and mpyuw (unsigned) intrinsics, available when __Xmpy16
 * is defined.  All four pragmas pass flags => "znv"; only the _f variants
 * also pass set_flags => 1.
 */
#if defined __Xmpy16
extern int _mpyw(int, int);
extern int _mpyw_f(int, int);
extern unsigned _mpyuw(unsigned, unsigned);
extern unsigned _mpyuw_f(unsigned, unsigned);
#pragma intrinsic(_mpyw, name => "mpyw", flags => "znv");
#pragma intrinsic(_mpyw_f, name => "mpyw", set_flags => 1, flags => "znv");
#pragma intrinsic(_mpyuw, name => "mpyuw", flags => "znv");
#pragma intrinsic(_mpyuw_f, name => "mpyuw", set_flags => 1, flags => "znv");
#endif /* __Xmpy16 */


/*
-----------------------------------------------
    __Xea - Extended Arithmetic
-----------------------------------------------
*/
#ifdef __Xea
/*
    ABSS, ADDSDW, ASLS, ASRS, DIVAW, NEGS, NEGSW, SUBS, SUBSDW

  opcode1         Instruction     Description
  -------         -----------     ---------------------
  0x02            SAT16           Saturation
  0x03            RND16           Two's complement rounding
  0x04            ABSSW           Absolute word with Saturation
  0x05            ABSS            Absolute with Saturation
  0x06            NEGSW           Negate and saturate 16-bit input
  0x06            ADDS            Signed addition with Saturation
  0x07            SUBS            Subtract and saturate.
  0x07            NEGS            Negate and saturate 32-bit input
  0x08            DIVAW           Division assist.
  0x0A            ASLS            Arithmetic shift left and saturate.
                                  Supports negative shift values for
                                  right shift.
  0x0B            ASRS            Arithmetic shift right and saturate.
                                  Supports negative shift values for
                                  left shift.
  0x28            ADDSDW          Dual 16-bit Add and Saturate
  0x29            SUBSDW          Dual16-bit subtract and saturate.

 */

/*
 * sat16 / rnd16: single int operand, int result.  The _f variants pass
 * set_flags => 1 with flags => "znv"; the plain variants pass no flag
 * options.  The remaining instructions of the table above are declared
 * in the conditional sections that follow this one.
 */
extern int _sat16(int);
#pragma intrinsic(_sat16, name => "sat16");

extern int _sat16_f(int);
#pragma intrinsic(_sat16_f, name => "sat16", set_flags => 1, flags => "znv");

extern int _rnd16(int);
#pragma intrinsic(_rnd16, name => "rnd16");

extern int _rnd16_f(int);
#pragma intrinsic(_rnd16_f, name => "rnd16", set_flags => 1, flags => "znv");
#endif /* __Xea */

/* abss: available with __Xea, or with __Xdsp when _ARCVER >= 0x41.
   Only the plain (non-_f) form is declared. */
#if defined __Xea || (defined __Xdsp && _ARCVER >= 0x41)
extern int _abss(int);
#pragma intrinsic(_abss, name => "abss");
#endif

/* addsdw: available with __Xea only; no _f form is declared. */
#ifdef __Xea

extern int _addsdw(int, int);
#pragma intrinsic(_addsdw, name => "addsdw");

#endif /* __Xea */

/*
 * asls / asrs (two int operands) and negs (one int operand).
 * Available when both __Xea and __Xbarrel_shifter are defined, or with
 * __Xdsp when _ARCVER >= 0x41.  The _f variants pass set_flags => 1 with
 * flags => "znv".
 */
#if (defined __Xea && defined __Xbarrel_shifter) ||  (defined __Xdsp && _ARCVER >= 0x41)
extern int _asls(int, int);
#pragma intrinsic(_asls, name => "asls");

extern int _asls_f(int, int);
#pragma intrinsic(_asls_f, name => "asls", set_flags => 1, flags => "znv");

extern int _asrs(int, int);
#pragma intrinsic(_asrs, name => "asrs");

extern int _asrs_f(int, int);
#pragma intrinsic(_asrs_f, name => "asrs", set_flags => 1, flags => "znv");

extern int _negs(int);
#pragma intrinsic(_negs, name => "negs");

extern int _negs_f(int);
#pragma intrinsic(_negs_f, name => "negs", set_flags => 1, flags => "znv");
#endif /* (__Xea && __Xbarrel_shifter) || (__Xdsp && _ARCVER >= 0x41) */

/*
 * divaw (two int operands) and negsw (one int operand), available with
 * __Xea only.  _negsw_f passes set_flags => 1 with flags => "znv"; divaw
 * has no _f form.
 */
#ifdef __Xea

extern int _divaw(int src1, int src2);
#pragma intrinsic(_divaw, name => "divaw");

extern int _negsw(int);
#pragma intrinsic(_negsw, name => "negsw");

extern int _negsw_f(int);
#pragma intrinsic(_negsw_f, name => "negsw", set_flags => 1, flags => "znv");

#endif /* __Xea */

/*
 * adds / subs: available with __Xea, or with __Xdsp when _ARCVER >= 0x41.
 * Unlike the other _f variants in this header, these list all four flags
 * ("zcnv") in the flags option.
 */
#if defined __Xea || (defined __Xdsp && _ARCVER >= 0x41)
extern int _adds(int, int);
#pragma intrinsic(_adds, name => "adds");

extern int _adds_f(int, int);
#pragma intrinsic(_adds_f, name => "adds", set_flags => 1, flags => "zcnv");

extern int _subs(int, int);
#pragma intrinsic(_subs, name => "subs");

extern int _subs_f(int, int);
#pragma intrinsic(_subs_f, name => "subs", set_flags => 1, flags => "zcnv");

#endif

/* subsdw: available with __Xea only; no _f form is declared. */
#if defined __Xea

extern int _subsdw(int, int);
#pragma intrinsic(_subsdw, name => "subsdw");

#endif /* __Xea */

/*
 * abssh, asrsr, negsh, rndh and sath intrinsics, available with __Xdsp
 * when _ARCVER >= 0x41.
 * Except for _abssh, the plain variants here pass flags => "znv" without
 * set_flags; the _f variants add set_flags => 1.  _abssh passes no flag
 * options and has no _f form.
 */
#if (defined __Xdsp && _ARCVER >= 0x41)
extern int _abssh(int);
#pragma intrinsic(_abssh, name => "abssh");

extern int _asrsr(int,int);
#pragma intrinsic(_asrsr, name => "asrsr", flags=> "znv");
extern int _asrsr_f(int,int);
#pragma intrinsic(_asrsr_f, name => "asrsr", set_flags=>1, flags=> "znv");


extern int _negsh(int);
#pragma intrinsic(_negsh, name => "negsh", flags=>"znv");
extern int _negsh_f(int);
#pragma intrinsic(_negsh_f, name => "negsh", set_flags=>1, flags=>"znv");

extern int _rndh(int);
#pragma intrinsic(_rndh, name => "rndh", flags=> "znv");
extern int _rndh_f(int);
#pragma intrinsic(_rndh_f, name => "rndh", set_flags=>1, flags=> "znv");

extern int _sath(int);
#pragma intrinsic(_sath, name => "sath", flags=> "znv");
extern int _sath_f(int);
#pragma intrinsic(_sath_f, name => "sath", set_flags=>1, flags=> "znv");

#endif /* __Xdsp && _ARCVER >= 0x41 */


/*
 * SWAP
 */
/* swap intrinsics (unsigned int in, unsigned int out), available when
   __Xswap is defined.  _swap_f passes set_flags => 1 with flags => "zn". */
#ifdef __Xswap
extern unsigned int _swap(unsigned int);
#pragma intrinsic(_swap, name => "swap");

extern unsigned int _swap_f(unsigned int);
#pragma intrinsic(_swap_f, name => "swap", set_flags => 1, flags => "zn");
#endif /* __Xswap */


/*
 * NORM
 */
/*
 * norm intrinsics, available when __Xnorm is defined.
 * _normw and _normh are two names for the same binding: both are mapped
 * to the instruction name "normw" (_normw is the deprecated spelling).
 * The _f variants pass set_flags => 1 with flags => "zn".
 */
#ifdef __Xnorm
extern int _normw(int);  // deprecated name
#pragma intrinsic(_normw, name => "normw");

extern int _normh(int);
#pragma intrinsic(_normh, name => "normw");

extern int _normw_f(int);  // deprecated name
#pragma intrinsic(_normw_f, name => "normw", set_flags => 1, flags => "zn");

extern int _normh_f(int);
#pragma intrinsic(_normh_f, name => "normw", set_flags => 1, flags => "zn");

extern int _norm(int);
#pragma intrinsic(_norm, name => "norm");

extern int _norm_f(int);
#pragma intrinsic(_norm_f, name => "norm", set_flags => 1, flags => "zn");
#endif /* __Xnorm */


/*
 * DMULPF
 */
/*
   Dual 32x16 MUL/MAC Instructions
   ------------------------------------

   opcode1		Instruction	Description
   -------		-----------	---------------------
   0x3A		DMULPF		Dual 32x16 multiply of signed XY data by
		   		        pseudo-floating point
   0x3B		DMACPF		Dual 32x16 multiply-accumulate of signed
				        XY data by pseudo-floating point
 */
/*
 * dmulpf / dmacpf intrinsics, available when __Xdmulpf is defined.
 * Every pragma passes the dmulpf => 1 option.  The _xy names are older
 * spellings bound to the same instructions as _dmulpf / _dmacpf.
 * The _f variants pass set_flags => 1 with flags => "zn" and, unlike the
 * other forms, do not pass latency_cycles.
 */
#ifdef __Xdmulpf

extern int _dmulpf_xy(int, int);  // old name
#pragma intrinsic(_dmulpf_xy, name => "dmulpf", latency_cycles => 2, dmulpf => 1);

extern int _dmacpf_xy(int, int);   // old name
#pragma intrinsic(_dmacpf_xy, name => "dmacpf", latency_cycles => 2, dmulpf => 1);

extern int _dmulpf(int, int);
#pragma intrinsic(_dmulpf, name => "dmulpf", latency_cycles => 2, dmulpf => 1);

extern int _dmulpf_f(int, int);
#pragma intrinsic(_dmulpf_f, name => "dmulpf", dmulpf=>1, set_flags => 1, flags => "zn");

extern int _dmacpf(int, int);
#pragma intrinsic(_dmacpf, name => "dmacpf", latency_cycles => 2, dmulpf => 1);

extern int _dmacpf_f(int, int);
#pragma intrinsic(_dmacpf_f, name => "dmacpf", dmulpf=>1, set_flags => 1, flags => "zn");

#endif /* __Xdmulpf */

/*
 * Exchange, byte-swap and load-locked / store-conditional intrinsics.
 *
 * Every intrinsic that takes an _Uncached pointer argument is declared
 * only under __CCAC__, because mcc cannot handle _Uncached arguments to
 * intrinsics.  This applies to the 64-bit _llockd_di / _scondd_di as well
 * as to the 32-bit _ex_di / _llock_di / _scond_di.
 */
#if _ARCVER >= 0x30  // ARC 700 and later

    // "value" is stored into the given pointer "ip".
    // The original content of the pointer is returned.
    // This operation happens atomically.
    extern int _ex(int value, int *ip);
    #pragma intrinsic(_ex, name => "ex");

    #ifdef __CCAC__   // mcc can't handle _Uncached arguments to intrinsics
	// "value" is stored into the given pointer "ip",
	// bypassing the cache.
	// The original (uncached) content of the pointer is returned.
	// This operation happens atomically.
	extern int _ex_di(int value, _Uncached int *ip);
	#pragma intrinsic(_ex_di, name => "ex_di");
    #endif

    // swape in ARC700 base case.
    // In av2 if SWAP option enabled.
    #if _ARCVER < 0x40 || (_ARCVER >= 0x41 && defined __Xswap)
	extern int _swape(int x);
	#pragma intrinsic(_swape, name => "swape");
    #endif

    // llock, scond in ARC 700 base case.
    // In av2 if ATOMIC option enabled.
    #if _ARCVER < 0x40 || (_ARCVER >= 0x41 && defined __Xatomic)
	// Load the content of the given pointer, setting
	// the LP to 1 and LPA to the value of "lockAddress" if successful.
	// NOTE(review): "LP" here and "LF" in the _scond comment below
	// presumably name the same lock flag -- confirm against the ISA manual.
	extern int _llock(int *lockAddress);
	#pragma intrinsic(_llock, name => "llock");

	#ifdef __CCAC__   // mcc can't handle _Uncached arguments to intrinsics
	    // Same as _llock but bypasses the memory cache.
	    extern int _llock_di(_Uncached int *lock);
	    #pragma intrinsic(_llock_di, name => "llock_di");
	#endif

	// Store "value" into the given "lockAddress" if the LF bit
	// set by a preceding _llock call is set.
	// Returns 0 if successful; non-zero if failed.
	extern int _scond(int value, int *lockAddress);
	#pragma intrinsic(_scond, name => "scond");

	#ifdef __CCAC__   // mcc can't handle _Uncached arguments to intrinsics
	    // Same as _scond but bypasses the memory cache.
	    extern int _scond_di(int value, _Uncached int *lockAddress);
	    #pragma intrinsic(_scond_di, name => "scond_di");
	#endif

	#if (_ARCVER >= 0x51)
	  #ifdef __Xll64
	    // 64-bit counterpart of _llock:
	    // load the content of the given pointer, setting
	    // the LP to 1 and LPA to the value of "lockAddress" if successful.
	    extern long long _llockd(long long *lockAddress);
	    #pragma intrinsic(_llockd, name => "llockd");

	    // 64-bit counterpart of _scond:
	    // store "value" into the given "lockAddress" if the LF bit
	    // set by a preceding _llockd call is set.
	    // Returns 0 if successful; non-zero if failed.
	    extern int _scondd(long long value, long long *lockAddress);
	    #pragma intrinsic(_scondd, name => "scondd");

	    #ifdef __CCAC__   // mcc can't handle _Uncached arguments to intrinsics
		// Same as _llockd but bypasses the memory cache.
		extern long long _llockd_di(_Uncached long long *lock);
		#pragma intrinsic(_llockd_di, name => "llockd_di");

		// Same as _scondd but bypasses the memory cache.
		extern int _scondd_di(long long value, _Uncached long long *lockAddress);
		#pragma intrinsic(_scondd_di, name => "scondd_di");
	    #endif
	  #endif

	  // Bound to the "wlfc" instruction; takes one unsigned operand.
	  extern void _wlfc(unsigned c);
	  #pragma intrinsic(_wlfc, name => "wlfc");
	#endif

    #endif
#endif /* _ARCVER >= 0x30 */

/* kflag intrinsic: available when _ARCVER >= 0x40; takes one unsigned
   operand and returns nothing. */
#if _ARCVER >= 0x40  
    // assign given value to the kernel STATUS32 register
    extern void _kflag(unsigned statusValue);
    #pragma intrinsic(_kflag, name => "kflag");
#endif



// av2 em 1.1
#if  _ARCVER >= 0x41
    // Bit-scan intrinsics, declared only when __Xnorm is defined.
    // Both take and return an int and are bound to the intrinsic names
    // "ffs" and "fls" (presumably find-first-set / find-last-set; confirm
    // the exact result convention against the ISA manual).
    #if defined __Xnorm
	extern int _ffs(int x);
	#pragma intrinsic(_ffs, name => "ffs");

	extern int _fls(int x);
	#pragma intrinsic(_fls, name => "fls");
    #endif

    // Atomic exchange with an AUX register.
    // Assigns "value" into the given AUX register ("auxRegister") and
    // returns the original value in the AUX register.
    // This operation happens atomically.
    // Bound to the intrinsic name "aex".
    extern int _aex(int value, unsigned auxRegister);
    #pragma intrinsic(_aex, name => "aex");

    // Helper macros used to generate the intrinsic declarations below.

    // Emits a pragma from inside a macro: the argument is stringified
    // (without macro expansion, as always for '#') and handed to the
    // _Pragma operator.
    #define __PRAGMA(x) _Pragma(#x)

    // Alias for the compiler builtin __builtin_modif; its semantics are
    // defined by the compiler and are not visible in this header.
    #define _modif(x) __builtin_modif(x)

    // __X2(n): declares "unsigned _<n>(unsigned, unsigned)" and binds it to
    // the intrinsic named <n>.  The ';' after __PRAGMA(...) is part of the
    // expansion, so call sites are written without a trailing semicolon.
    #define __X2(n) \
    extern unsigned _##n(unsigned,unsigned); \
    __PRAGMA(intrinsic(_##n , name=> #n)); 

    // __X1(n): one-operand counterpart of __X2; declares
    // "unsigned _<n>(unsigned)" and binds it to the intrinsic named <n>.
    #define __X1(n) \
    extern unsigned _##n(unsigned); \
    __PRAGMA(intrinsic(_##n , name=> #n)); 

    // __AUX(r,n): emits "#pragma aux_register(r, name=>"n")" for register
    // identifier r.
    #define __AUX(r,n) \
       __PRAGMA(aux_register(r,name=>#n));

    #if defined __Xdsp && defined __CCAC__
        // Name the DSP auxiliary registers for the compiler via
        // "#pragma aux_register" (see __AUX): accumulator words first,
        // then the DSP control/build registers.
        __AUX(ACC0_LO,acc0_lo)
        __AUX(ACC0_GLO,acc0_glo)
        __AUX(ACC0_HI,acc0_hi)
        __AUX(ACC0_GHI,acc0_ghi)
        __AUX(DSP_BFLY,dsp_bfly)
        __AUX(DSP_FFT_CTRL,dsp_fft_ctrl)
        __AUX(DSP_CTRL,dsp_ctrl)
        __AUX(DSP_BUILD,dsp_build)
	// Vector align / extend / pack / replicate intrinsics.  Each line
	// declares "unsigned _<name>(unsigned[, unsigned])" through __X1 / __X2;
	// the group inside "#ifdef __Xdsp2" exists only when __Xdsp2 is defined.
	__X2(valgn2h)
	__X1(vext2bhl)
	__X1(vext2bhm)
	#ifdef __Xdsp2
	    __X1(vext2bhlf) 
	    __X1(vext2bhmf)
	    __X1(vpack2hbl)
	    __X1(vpack2hblf)
	    __X1(vpack2hbm)
	    __X1(vpack2hbmf)
	    __X2(vpack2hl)
	    __X2(vpack2hm)
	    __X2(vperm)
	#endif
	__X1(vrep2hl)
	__X1(vrep2hm)
	__X1(vsext2bhl)
	__X1(vsext2bhm)

	// Vector arithmetic / shift / min-max intrinsics.  __X1 entries take
	// one unsigned operand, __X2 entries take two; all return unsigned.
	// NOTE: _vadd2h, _vaddsub2h, _vsub2h and _vsubadd2h are declared again,
	// with the same prototypes, in the __Xmpy_option >= 7 section further
	// down in this header.
	__X1(vabs2h)
	__X1(vabss2h)
	__X2(vadd2h)
	__X2(vadd4b)
	__X2(vadds2h)
	__X2(vaddsub2h)
	__X2(vaddsubs2h)
	__X2(vasl2h)
	__X2(vasls2h)
	__X2(vasr2h)
	__X2(vasrs2h)
	__X2(vasrsr2h)
	__X2(vlsr2h)
	__X2(vmax2h)
	__X2(vmin2h)
	__X1(vneg2h)
	__X1(vnegs2h)
	__X1(vnorm2h)
	__X2(vsub2h)
	__X2(vsub4b)
	__X2(vsubadd2h)
	__X2(vsubadds2h)
	__X2(vsubs2h)
	// Extra "#pragma intrinsic" clauses describing accumulator side effects
	// (R = acc0_lo/acc0_hi read, W = written, RW = both).  They expand to
	// text only when __HIGHC__ is non-zero; otherwise they are empty.
	// NOTE(review): __PRAGMA stringifies its argument unexpanded, so these
	// names reach the pragma text as macro names; presumably the compiler
	// macro-expands pragma tokens -- confirm.
	#if __HIGHC__
	    // MCC requires explicit side-effects; ccac has them builtin
	    #define __ACC_R_EFFECTS , assume_volatile=>1, effects=>"%acc0_lo:is_read;%acc0_hi:is_read"
	    #define __ACC_W_EFFECTS , assume_volatile=>1, effects=>"%acc0_lo:is_written;%acc0_hi:is_written"
	    #define __ACC_RW_EFFECTS , assume_volatile=>1, effects=>"%acc0_lo:is_read:is_written;%acc0_hi:is_read:is_written"
	#else
	    #define __ACC_R_EFFECTS 
	    #define __ACC_W_EFFECTS
	    #define __ACC_RW_EFFECTS 
	#endif

        // __ACCR(n): declares "int _<n>(int)" bound to intrinsic <n>, tagged
        // with the accumulator-read effects.
        #define __ACCR(n) \
	extern int _##n(int); \
	__PRAGMA(intrinsic(_##n , name=> #n __ACC_R_EFFECTS)); 

        // __ACCRW(n): declares "void _<n>(int)" bound to intrinsic <n>, tagged
        // with the accumulator read+write effects.
        #define __ACCRW(n) \
	extern void _##n(int); \
	__PRAGMA(intrinsic(_##n , name=> #n __ACC_RW_EFFECTS));

        // __ACCW0(n): declares "void _<n>(void)" bound to intrinsic <n>.
        // NOTE(review): despite the "W" in the name it uses the read+write
        // effects, and it is not invoked in the DSP section that follows.
        #define __ACCW0(n) \
	extern void _##n(void); \
	__PRAGMA(intrinsic(_##n , name=> #n __ACC_RW_EFFECTS));

	// Accumulator intrinsics: _aslacc/_aslsacc are "void f(int)" with
	// read+write effects, _getacc/_normacc are "int f(int)" with read
	// effects.  _sqrtacc/_divacc are declared only when __Xdsp2 is NOT
	// defined.
	__ACCRW(aslacc)
	__ACCRW(aslsacc)
	__ACCR(getacc)
	__ACCR(normacc)
        #ifndef __Xdsp2
	    __ACCRW(sqrtacc)
	    __ACCRW(divacc)
	#endif

        // Square-root intrinsics, declared only for __Xdsp2 builds that also
        // define __Xdsp_divsqrt_radix2 or __Xdsp_divsqrt_radix4.  Both are
        // "int f(int)", carry the accumulator-write effects and list flag "z".
        // (The declarations previously ended in a stray '\' outside any macro,
        // which spliced the following line onto them; removed -- the token
        // stream is unchanged.)
        #if defined(__Xdsp2) && (defined(__Xdsp_divsqrt_radix2) || defined(__Xdsp_divsqrt_radix4))
	extern int _sqrt(int);
	__PRAGMA(intrinsic(_sqrt , name=> "sqrt" __ACC_W_EFFECTS, flags=>"z"));
	extern int _sqrtf(int);
	__PRAGMA(intrinsic(_sqrtf , name=> "sqrtf" __ACC_W_EFFECTS, flags=>"z"));
	#endif

	// _setacc: "int f(int, int)" bound to intrinsic "setacc", tagged with
	// the accumulator-write effects.  (A stray line-continuation '\' after
	// the declaration was removed; the token stream is unchanged.)
	extern int _setacc(int,int);
	__PRAGMA(intrinsic(_setacc , name=> "setacc" __ACC_W_EFFECTS));

	// _flagacc: "void f(int)" bound to intrinsic "flagacc".  The pragma
	// marks it assume_volatile and declares its effects explicitly:
	// %status is written, %acc0_glo and %acc0_ghi are read.
	extern void _flagacc(int c);
	#pragma intrinsic(_flagacc, name=>"flagacc", assume_volatile=>1, \
		effects=>"%status:is_written;%acc0_glo:is_read;%acc0_ghi:is_read" )

	// Generators for the multiply / multiply-accumulate intrinsics.
	//
	//   n          C-level name; the function declared is _<n>
	//   op         intrinsic (instruction) name the function is bound to
	//   ReturnType / ArgType   prototype is "ReturnType _<n>(ArgType, ArgType)"
	//   Flags      text passed as the pragma's flags=> string
	//
	// MAC forms are tagged with the accumulator read+write effects, MPY
	// forms with the accumulator write effects.  The *_NO_FLAGS forms emit
	// no flags=> clause.
	//
	// Every *_OP generator binds to "op"; the wrappers without _OP pass the
	// same identifier for both n and op.  (__MAC_OP and __MPY_OP_NO_FLAGS
	// used to stringify n instead of op, silently ignoring their op
	// parameter; all existing uses pass op == n, so expansions are unchanged.)
	#define __MAC_NO_FLAGS(n,ReturnType,ArgType) \
	    extern ReturnType _##n(ArgType,ArgType); \
	    __PRAGMA(intrinsic(_##n, name=>#n __ACC_RW_EFFECTS));
	#define __MPY_OP_NO_FLAGS(n,op,ReturnType,ArgType) \
	    extern ReturnType _##n(ArgType,ArgType); \
	    __PRAGMA(intrinsic(_##n, name=>#op __ACC_W_EFFECTS));
	#define __MPY_NO_FLAGS(n,ReturnType,ArgType)  __MPY_OP_NO_FLAGS(n,n,ReturnType,ArgType)

	#define __MAC_OP(n,op,ReturnType,ArgType,Flags) \
	    extern ReturnType _##n(ArgType,ArgType); \
	    __PRAGMA(intrinsic(_##n, name=>#op __ACC_RW_EFFECTS, flags=>#Flags));
	// __MAC: result and operands share one type; __MACD: long long result.
	#define __MAC(n,type,Flags)  __MAC_OP(n,n,type,type,Flags)
	#define __MACD(n,type,Flags)  __MAC_OP(n,n,long long,type,Flags)
	#define __MPY_OP(n,op,returnType,argType,Flags) \
	    extern returnType _##n(argType,argType); \
	    __PRAGMA(intrinsic(_##n, name=>#op __ACC_W_EFFECTS, flags=>#Flags));
	// __MPY: result and operands share one type; __MPYD: long long result.
	#define __MPY(n,type,Flags)  __MPY_OP(n,n,type,type,Flags)
	#define __MPYD(n,type,Flags)  __MPY_OP(n,n,long long,type,Flags)

	// Vector (2 x halfword) multiply / multiply-accumulate / multiply-
	// subtract intrinsics, declared without a flags=> clause.  Arguments are
	// (name, return type, operand type).
	// The entries commented out as "elsewhere" (_vmpy2h, _vmpy2hu) are
	// declared in the __Xmpy_option >= 8 section of this header.
	__MAC_NO_FLAGS(vmac2h,long long, unsigned)
	__MAC_NO_FLAGS(vmac2hf,unsigned, unsigned)
	__MAC_NO_FLAGS(vmac2hfr,unsigned, unsigned)
	__MAC_NO_FLAGS(vmac2hnfr,unsigned, unsigned)
	__MAC_NO_FLAGS(vmac2hu,unsigned long long, unsigned)
//	__MPY_NO_FLAGS(vmpy2h,  long long, unsigned)       // elsewhere
	__MPY_NO_FLAGS(vmpy2hf,  unsigned, unsigned)
	__MPY_NO_FLAGS(vmpy2hfr,  unsigned, unsigned)
//	__MPY_NO_FLAGS(vmpy2hu,  long long, unsigned)     // elsewhere
	__MPY_NO_FLAGS(vmpy2hwf, long long, unsigned)
	__MAC_NO_FLAGS(vmsub2hf, unsigned, unsigned)
	__MAC_NO_FLAGS(vmsub2hfr, unsigned, unsigned)
	__MAC_NO_FLAGS(vmsub2hnfr, unsigned, unsigned)

// Dual-halfword and scalar multiply / multiply-accumulate / multiply-
// subtract intrinsics.  Arguments are (name, type, flags); the *D generators
// give a long long result.  Commented-out entries are intentionally
// disabled: those marked "elsewhere" (and _mac, _macu, _macd, _macdu, _mpyd,
// _mpydu) are declared in the __Xmpy_option sections of this header.  Some
// of the disabled lines still carry an older argument list and would need
// fixing before being re-enabled.
//	__MAC(dmach, unsigned, nv)     		//elsewhere
	__MAC(dmachf, unsigned, znv)
	__MAC(dmachfr, unsigned, znv)
//	__MAC(dmachu, dmachu, unsigned, v)      //elsewhere
//	__MPY(dmpyh, unsigned, nv)   		//elsewhere
	__MPY(dmpyhf, unsigned, znv)
        __MPY(dmpyhwf, unsigned, znv)
	__MPY(dmpyhfr, unsigned, znv)
//	__MPY(dmpyhu, unsigned, v)

//	__MAC(mac, mac, int, zn)
//	__MACD(macd, macd, int, znv)
	__MACD(macdf,int,znv)
//	__MACD(macdu,unsigned,v)
	__MAC(macf,int,vnz)
	__MAC(macfr,int,vnz)
//	__MAC(macu,unsigned, v)

	__MPY(mpy,int,znv)
//	__MPYD(mpyd,int,znv)
	__MPYD(mpydf,int,znv)
//	__MPYD(mpydu,int,zv)
	__MPY(mpyf,int,znv)
	__MPY(mpyfr,int,znv)
	__MPY(mpym,int,znv)
	__MPY(mpymu,unsigned,znv)
	__MPY(mpyu,unsigned,znv)
//	__MPY(mpyuw,unsigned, znv) 
//	__MPY(mpyw, int,znv)  

	__MACD(msubdf,int,znv)
	__MAC(msubf,int,znv)
	__MAC(msubfr,int,znv)

        // Word x halfword multiply / multiply-accumulate / multiply-subtract
        // intrinsics, followed by the dmachb* / dmpyhb* forms.  Groups inside
        // "#ifdef __Xdsp2" exist only when __Xdsp2 is defined.
        // NOTE(review): _mpywhul is declared with int while _macwhul and
        // _mpywhkul use unsigned -- confirm whether that is intended.
        #ifdef __Xdsp2
	__MAC(macwhfl,int,znv)
	__MAC(macwhflr,int,znv)
	__MAC(macwhkl,int,nv)
	__MAC(macwhkul,unsigned,nv)
	#endif
	__MAC(macwhfm,int,znv)
	__MAC(macwhfmr,int,znv)
	__MAC(macwhl,int,nv)
	__MAC(macwhul,unsigned,v)

	__MPY(mpywhfl,int,znv)
	__MPY(mpywhflr,int,znv)
	__MPY(mpywhfm,int,znv)
	__MPY(mpywhfmr,int,znv)
        #ifdef __Xdsp2
	__MPY(mpywhkl,int,znv)
	__MPY(mpywhkul,unsigned,znv)
	#endif
	__MPY(mpywhl,int,znv)
	__MPY(mpywhul,int,znv)

        #ifdef __Xdsp2
	__MAC(msubwhfl,int,znv)
	__MAC(msubwhflr,int,znv)
	__MAC(msubwhfm,int,znv) 
	__MAC(msubwhfmr,int,znv)
	#endif

	// Explicit __MAC_OP/__MPY_OP use: int result with unsigned operands.
	__MAC_OP(dmachbl,dmachbl,int,unsigned,nv)  
	__MAC_OP(dmachbm,dmachbm,int,unsigned,nv) 
	__MPY_OP(dmpyhbl,dmpyhbl,int,unsigned,nv)
	__MPY_OP(dmpyhbm,dmpyhbm,int,unsigned,nv)

        // _divf: "int f(int, int)" bound to intrinsic "divf" with the
        // accumulator-write effects; declared only when both __Xdsp2 and
        // __Xdiv_rem are defined.
        #ifdef __Xdsp2
	    #ifdef __Xdiv_rem
		__MPY(divf,int,znv)
	    #endif
	#endif

    #endif

    // Xbitstream is an EM extension typically used with DSP.
    // Declared only for ccac with __Xbitstream: _bspeek and _bspop take one
    // unsigned operand, _bspush takes two; all return unsigned.
    #if defined(__CCAC__) && defined(__Xbitstream)
        __X1(bspeek)
        __X1(bspop)
        __X2(bspush)
    #endif

    // Complex-arithmetic intrinsics, declared for ccac when __Xdsp_complex
    // or __Xdsp2 is defined.  The local generators are removed again at the
    // end of this block.
    #if defined __CCAC__ && (defined(__Xdsp_complex) || defined(__Xdsp2))
	// __CB1: "unsigned _<n>(unsigned)" bound to intrinsic <n>.
	#define __CB1(n) \
	    extern unsigned _##n(unsigned); \
	    __PRAGMA(intrinsic(_##n, name=>#n));
	// __CB2: "unsigned _<n>(unsigned, unsigned)" bound to intrinsic <n>.
	#define __CB2(n) \
	    extern unsigned _##n(unsigned,unsigned); \
	    __PRAGMA(intrinsic(_##n, name=>#n));
	// __CBMPY: currently expands exactly like __CB2.
	#define __CBMPY(n) \
	    extern unsigned _##n(unsigned,unsigned); \
	    __PRAGMA(intrinsic(_##n, name=>#n));

        // Butterfly intrinsics exist only with __Xdsp_complex.
        #if defined __Xdsp_complex
	    __CB2(cbflyhf0r)
	    __CB1(cbflyhf1r)
        #endif

	__CB2(cmacchnfr)
	__CB2(cmacchfr)
	__CB2(cmachnfr)
	__CB2(cmachfr)
	__CB2(cmpychnfr)
	__CB2(cmpychfr)
	__CBMPY(cmpyhnfr)
	__CBMPY(cmpyhfmr)
	__CBMPY(cmpyhfr)
	#undef __CB1
	#undef __CB2
	#undef __CBMPY
    #endif

    // Retire every helper macro defined above so none of them leaks into
    // code that includes this header.  __MACD, __MPYD, __MPY_OP, __ACCW0,
    // __AUX and the __ACC_*_EFFECTS macros used to be left defined even
    // though they depend on __PRAGMA, which is removed here.  #undef of a
    // name that is not defined (e.g. when the DSP section was skipped) is
    // harmless.  _modif is deliberately kept: it is a public alias.
    #undef __MPYD
    #undef __MPY
    #undef __MPY_OP
    #undef __MACD
    #undef __MAC
    #undef __MAC_OP
    #undef __MPY_NO_FLAGS
    #undef __MPY_OP_NO_FLAGS
    #undef __MAC_NO_FLAGS
    #undef __ACCW0
    #undef __ACCRW
    #undef __ACCR
    #undef __ACC_RW_EFFECTS
    #undef __ACC_W_EFFECTS
    #undef __ACC_R_EFFECTS
    #undef __AUX
    #undef __X2
    #undef __X1
    #undef __PRAGMA
    
    // Declared only when __Xdsp2 is defined.  All three are marked
    // assume_volatile in their pragma.  _adcs and _sbcs take two ints,
    // _satf takes one; each returns int.
    #ifdef __Xdsp2
    extern int _adcs(int, int);
    #pragma intrinsic(_adcs, name => "adcs", assume_volatile=>1)

    extern int _sbcs(int, int);
    #pragma intrinsic(_sbcs, name => "sbcs", assume_volatile=>1)

    extern int _satf(int);
    #pragma intrinsic(_satf, name => "satf", assume_volatile=>1)
    #endif

#endif

/* 
 * AV2HS DSP-like instructions
 */
#if defined(__Xmpy_option) 
// Intrinsics available when __Xmpy_option >= 7: scalar and dual-halfword
// multiply / multiply-accumulate, plus 2 x halfword vector add/subtract.
#   if __Xmpy_option >= 7

	// NOTE: each operand and the result is a pair of 16-bit values.
        extern int _mac(int,int);
        extern unsigned _macu(unsigned,unsigned);
        extern int _dmach(unsigned ,unsigned );
        extern unsigned _dmachu(unsigned ,unsigned );
	extern int _dmpyh(unsigned a, unsigned b);
	extern unsigned _dmpyhu(unsigned a, unsigned b);
	#pragma intrinsic(_mac,name=>"mac")
	#pragma intrinsic(_macu,name=>"macu")
	#pragma intrinsic(_dmach,name=>"dmach")
	#pragma intrinsic(_dmachu,name=>"dmachu")
	#pragma intrinsic(_dmpyh,name=>"dmpyh")
	#pragma intrinsic(_dmpyhu,name=>"dmpyhu")

	// These four are also generated, with identical prototypes, by the
	// __X2() list in the __Xdsp section of this header.
	extern unsigned _vadd2h(unsigned b, unsigned c);
	extern unsigned _vaddsub2h(unsigned b, unsigned c);
	extern unsigned _vsub2h(unsigned b, unsigned c);
	extern unsigned _vsubadd2h(unsigned b, unsigned c);

	#pragma intrinsic(_vadd2h,name=>"vadd2h")
	#pragma intrinsic(_vaddsub2h,name=>"vaddsub2h")
	#pragma intrinsic(_vsub2h,name=>"vsub2h")
	#pragma intrinsic(_vsubadd2h,name=>"vsubadd2h")
#   endif
// Intrinsics available when __Xmpy_option >= 8: multiplies and
// multiply-accumulates with a 64-bit (long long) result.
#   if __Xmpy_option >= 8
        extern long long _macd(int,int);
        extern unsigned long long _macdu(unsigned,unsigned);
        extern long long _mpyd(int,int);
        extern unsigned long long _mpydu(unsigned,unsigned);
	// NOTE: "b" and "c" are each a pair of halfwords.
	extern unsigned long long _vmpy2h(unsigned b,unsigned c);
	extern unsigned long long _vmpy2hu(unsigned b,unsigned c);

	#pragma intrinsic(_macd,name=>"macd")
	#pragma intrinsic(_macdu,name=>"macdu")
	#pragma intrinsic(_mpyd,name=>"mpyd")
	#pragma intrinsic(_mpydu,name=>"mpydu")
	#pragma intrinsic(_vmpy2h,name=>"vmpy2h")
	#pragma intrinsic(_vmpy2hu,name=>"vmpy2hu")
#   endif
// Intrinsics available when __Xmpy_option >= 9: operations on 64-bit
// (long long) vector operands.
#   if __Xmpy_option >= 9 
        // "b" and the result are each a pair of 32-bit words;
	// "c" is a pair of shorts.
        extern long long _dmacwh(unsigned long long b,unsigned long c);
        extern unsigned long long _dmacwhu(unsigned long long b, unsigned long c);
        extern long long _dmpywh(unsigned long long b, unsigned long c);
        extern unsigned long long _dmpywhu(unsigned long long b, unsigned long c);

	// The result, "b", and "c" are each 4 halfwords
        extern long long _qmach(unsigned long long b, unsigned long long c);
        extern unsigned long long _qmachu(unsigned long long b, unsigned long long c);
        extern long long _qmpyh(unsigned long long b, unsigned long long c);
        extern unsigned long long _qmpyhu(unsigned long long b, unsigned long long c);
        extern unsigned long long _vadd4h(unsigned long long b, unsigned long long c);
        extern unsigned long long _vaddsub4h(unsigned long long b, unsigned long long c);
        extern unsigned long long _vsub4h(unsigned long long b, unsigned long long c);
        extern unsigned long long _vsubadd4h(unsigned long long b, unsigned long long c);

	// The result, "b" and "c" are each a pair of words
	extern unsigned long long _vadd2(unsigned long long b, unsigned long long c);
	extern unsigned long long _vaddsub(unsigned long long b, unsigned long long c);
	extern unsigned long long _vsubadd(unsigned long long b, unsigned long long c);
	extern unsigned long long _vsub2(unsigned long long b, unsigned long long c);

	// Bind every declaration above to the intrinsic of the same name
	// (without the leading underscore).
	#pragma intrinsic(_dmacwh,name=>"dmacwh")
	#pragma intrinsic(_dmacwhu,name=>"dmacwhu")
	#pragma intrinsic(_dmpywh,name=>"dmpywh")
	#pragma intrinsic(_dmpywhu,name=>"dmpywhu")
	#pragma intrinsic(_qmach,name=>"qmach")
	#pragma intrinsic(_qmachu,name=>"qmachu")
	#pragma intrinsic(_qmpyh,name=>"qmpyh")
	#pragma intrinsic(_qmpyhu,name=>"qmpyhu")
	#pragma intrinsic(_vadd2,name=>"vadd2")
	#pragma intrinsic(_vadd4h,name=>"vadd4h")
	#pragma intrinsic(_vaddsub,name=>"vaddsub")
	#pragma intrinsic(_vaddsub4h,name=>"vaddsub4h")
	#pragma intrinsic(_vsub2,name=>"vsub2")
	#pragma intrinsic(_vsub4h,name=>"vsub4h")
	#pragma intrinsic(_vsubadd,name=>"vsubadd")
	#pragma intrinsic(_vsubadd4h,name=>"vsubadd4h")
#   endif
#endif


// Synchronisation / event intrinsics, available when _ARCVER >= 0x51.
// All are marked assume_volatile in their pragma.
#if _ARCVER >= 0x51  // av2hs core 1
     // _dsync: no operands, no result; bound to intrinsic "dsync".
     extern void _dsync(void);
     #pragma intrinsic(_dsync, name=>"dsync", assume_volatile=>1)

     // _dmb is disabled ("#if 0") until the operand can be restricted to a
     // compile-time constant.
     #if 0 // TODO: need to be able to limit the argument to a constant
     extern void _dmb(unsigned typeMask); // Operand must be a constant, a 3-bit mask
     #pragma intrinsic(_dmb, name=>"dmb", assume_volatile=>1)
     #endif

     // _wevt: one unsigned operand, no result; bound to intrinsic "wevt".
     extern void _wevt(unsigned c);
     #pragma intrinsic(_wevt, name=>"wevt", assume_volatile=>1)

#endif

// _trap is disabled ("#if 0") until the operand can be restricted to a
// compile-time constant.  Note that the C name _trap would bind to the
// intrinsic name "trap_s".
#if 0 // TODO: need way to limit argument to a constant
extern void _trap(unsigned c); // argument must be 6-bit constant
#pragma intrinsic(_trap, name=>"trap_s", assume_volatile=>1)
#endif


/*
   Arithmetic Intrinsics for the instructions:
   ABS, ADC, ADD, ADD1, ADD2, ADD3, ADDS, EXTB, EXTW, MAX, MIN,
   MOV, NEG, RND16, SAT16, SBC, SUB1, SUB2, SUB3, RSUB, SEXB, SEXW, SUB

*/

// ADD family: _add, _add1, _add2, _add3.  Each takes two ints and returns
// an int, and is bound to the intrinsic of the same name.  The *_f variant
// binds to the same intrinsic but additionally requests flag setting
// (set_flags => 1) and lists the affected flags as "zcnv".
extern int _add(int, int);
#pragma intrinsic(_add, name => "add");

extern int _add_f(int, int);
#pragma intrinsic(_add_f, name => "add", set_flags => 1, flags => "zcnv");

extern int _add1(int, int);
#pragma intrinsic(_add1, name => "add1");

extern int _add1_f(int, int);
#pragma intrinsic(_add1_f, name => "add1", set_flags => 1, flags => "zcnv");

extern int _add2(int, int);
#pragma intrinsic(_add2, name => "add2");

extern int _add2_f(int, int);
#pragma intrinsic(_add2_f, name => "add2", set_flags => 1, flags => "zcnv");

extern int _add3(int, int);
#pragma intrinsic(_add3, name => "add3");

extern int _add3_f(int, int);
#pragma intrinsic(_add3_f, name => "add3", set_flags => 1, flags => "zcnv");

// MAX / MIN: only the flag-setting variants are declared.  The plain
// _max/_min declarations and the __Xmin_max guard are kept below, commented
// out, for reference.

//#ifdef __Xmin_max ... base case for ARCompact
//extern int _max(int, int);
//#pragma intrinsic(_max, name => "max");
// ...commented out... Compiler already defines max()

// _max_f: "max" with set_flags => 1 and flags "zcnv".
extern int _max_f(int, int);
#pragma intrinsic(_max_f, name => "max", set_flags => 1, flags => "zcnv");

//extern int _min(int, int);
//#pragma intrinsic(_min, name => "min");
// ...commented out... Compiler already defines min()

// _min_f: "min" with set_flags => 1 and flags "zcnv".
extern int _min_f(int, int);
#pragma intrinsic(_min_f, name => "min", set_flags => 1, flags => "zcnv");
//#endif /* __Xmin_max */

// MOV: a plain _mov is intentionally not provided (kept commented out);
// use _core_read() and _core_write() instead.
//extern int _mov(int, int);
//#pragma intrinsic(_mov, name => "mov", assume_volatile => 1);
// _mov not needed ... use _core_read() and _core_write instead

// _mov_f: takes one int, returns int; bound to "mov" with set_flags => 1,
// flags "zn", and marked assume_volatile.
extern int _mov_f(int);
#pragma intrinsic(_mov_f, name => "mov", set_flags => 1, flags => "zn", assume_volatile => 1);

// SUB family: _sub1, _sub2, _sub3, _sub.  Each takes two ints and returns
// an int, and is bound to the intrinsic of the same name.  The *_f variant
// binds to the same intrinsic but additionally requests flag setting
// (set_flags => 1) and lists the affected flags as "zcnv".
extern int _sub1(int, int);
#pragma intrinsic(_sub1, name => "sub1");

extern int _sub1_f(int, int);
#pragma intrinsic(_sub1_f, name => "sub1", set_flags => 1, flags => "zcnv");

extern int _sub2(int, int);
#pragma intrinsic(_sub2, name => "sub2");

extern int _sub2_f(int, int);
#pragma intrinsic(_sub2_f, name => "sub2", set_flags => 1, flags => "zcnv");

extern int _sub3(int, int);
#pragma intrinsic(_sub3, name => "sub3");

extern int _sub3_f(int, int);
#pragma intrinsic(_sub3_f, name => "sub3", set_flags => 1, flags => "zcnv");

extern int _sub(int, int);
#pragma intrinsic(_sub, name => "sub");

extern int _sub_f(int, int);
#pragma intrinsic(_sub_f, name => "sub", set_flags => 1, flags => "zcnv");


/*
   Other Base Case ALU instructions
   AND, OR, BSET, BCLR, BTST, BXOR, BMSK,
   ASL, ASR, LSR, ROR, RRC
 */

// AND / OR: first operand and result are unsigned int, second operand is
// int.  The *_f variants bind to the same intrinsic with set_flags => 1 and
// flags "zn".
extern unsigned int _and(unsigned int, int);
#pragma intrinsic(_and, name => "and");

extern unsigned int _and_f(unsigned int, int);
#pragma intrinsic(_and_f, name => "and", set_flags => 1, flags => "zn");

extern unsigned int _or(unsigned int, int);
#pragma intrinsic(_or, name => "or");

extern unsigned int _or_f(unsigned int, int);
#pragma intrinsic(_or_f, name => "or", set_flags => 1, flags => "zn");

// Bit-manipulation intrinsics: BSET, BCLR, BTST, BXOR, BMSK.  Each takes
// two ints and returns an int.  The *_f variants bind to the same intrinsic
// with set_flags => 1 and flags "zn".  BTST is provided only in its
// flag-setting form (_btst_f).
extern int _bset(int, int);
#pragma intrinsic(_bset, name => "bset");

extern int _bset_f(int, int);
#pragma intrinsic(_bset_f, name => "bset", set_flags => 1, flags => "zn");

extern int _bclr(int, int);
#pragma intrinsic(_bclr, name => "bclr");

extern int _bclr_f(int, int);
#pragma intrinsic(_bclr_f, name => "bclr", set_flags => 1, flags => "zn");

extern int _btst_f(int, int);
#pragma intrinsic(_btst_f, name => "btst", set_flags => 1, flags => "zn");

extern int _bxor(int, int);
#pragma intrinsic(_bxor, name => "bxor");

extern int _bxor_f(int, int);
#pragma intrinsic(_bxor_f, name => "bxor", set_flags => 1, flags => "zn");

extern int _bmsk(int, int);
#pragma intrinsic(_bmsk, name => "bmsk");

extern int _bmsk_f(int, int);
#pragma intrinsic(_bmsk_f, name => "bmsk", set_flags => 1, flags => "zn");

// Shift / rotate intrinsics: ASL, ASR, LSR, ROR take two operands, RRC
// takes one.  The *_f variants bind to the same intrinsic with
// set_flags => 1; the flag list is "zcnv" for ASL and "zcn" for the rest.
// ROR uses unsigned int for its first operand and result; the others use
// int throughout.
extern int _asl(int, int);
#pragma intrinsic(_asl, name => "asl");

extern int _asl_f(int, int);
#pragma intrinsic(_asl_f, name => "asl", set_flags => 1, flags => "zcnv");

extern int _asr(int, int);
#pragma intrinsic(_asr, name => "asr");

extern int _asr_f(int, int);
#pragma intrinsic(_asr_f, name => "asr", set_flags => 1, flags => "zcn");

extern int _lsr(int, int);
#pragma intrinsic(_lsr, name => "lsr");

extern int _lsr_f(int, int);
#pragma intrinsic(_lsr_f, name => "lsr", set_flags => 1, flags => "zcn");

extern unsigned int _ror(unsigned int, int);
#pragma intrinsic(_ror, name => "ror");

extern unsigned int _ror_f(unsigned int, int);
#pragma intrinsic(_ror_f, name => "ror", set_flags => 1, flags => "zcn");

extern int _rrc(int);
#pragma intrinsic(_rrc, name => "rrc");

extern int _rrc_f(int);
#pragma intrinsic(_rrc_f, name => "rrc", set_flags => 1, flags => "zcn");

// _fssqrt: float in, float out; bound to intrinsic "fssqrt".  Declared only
// when __Xfpus_div is defined.
#ifdef __Xfpus_div
    extern float _fssqrt(float);
    #pragma intrinsic(_fssqrt, name => "fssqrt");
#endif

// _fdsqrt: double in, double out; bound to intrinsic "fdsqrt".  Declared
// only when __Xfpud_div is defined.
#ifdef __Xfpud_div
    extern double _fdsqrt(double);
    #pragma intrinsic(_fdsqrt, name => "fdsqrt");
#endif

// Fastmath saturation group, declared only when __Xfastmath_sat is defined.
// Operands and results are plain ints; the numeric format they encode is
// not described in this header.
#ifdef __Xfastmath_sat
    extern int _fmp_adds(int, int);
    #pragma intrinsic(_fmp_adds, name => "fmp_adds");

    extern int _fmp_rndh(int);
    #pragma intrinsic(_fmp_rndh, name => "fmp_rndh");

    extern int _fmp_sath(int);
    #pragma intrinsic(_fmp_sath, name => "fmp_sath");
#endif

// Fastmath divide / reciprocal group, declared only when __Xfastmath_div is
// defined.  Operands and results are plain ints.
// NOTE(review): the "15"-suffixed variants presumably operate on a 15-bit
// fractional format -- confirm against the fastmath documentation.
#ifdef __Xfastmath_div
    extern int _fmp_divf(int, int);
    #pragma intrinsic(_fmp_divf, name => "fmp_divf");

    extern int _fmp_divf15(int, int);
    #pragma intrinsic(_fmp_divf15, name => "fmp_divf15");

    extern int _fmp_recip(int);
    #pragma intrinsic(_fmp_recip, name => "fmp_recip");

    extern int _fmp_recip15(int);
    #pragma intrinsic(_fmp_recip15, name => "fmp_recip15");
#endif

// Fastmath square-root group, declared only when __Xfastmath_sqrt is
// defined.  Each takes one int and returns an int.
#ifdef __Xfastmath_sqrt
    extern int _fmp_sqrtf(int);
    #pragma intrinsic(_fmp_sqrtf, name => "fmp_sqrtf");

    extern int _fmp_sqrtf15(int);
    #pragma intrinsic(_fmp_sqrtf15, name => "fmp_sqrtf15");
#endif

// Fastmath transcendental group, declared only when __Xfastmath_trig is
// defined.  Besides cos/sin/atan the same guard also covers the exp2 and
// log2 intrinsics.  Each takes one int and returns an int; every function
// has a "15"-suffixed sibling.
#ifdef __Xfastmath_trig
    extern int _fmp_cos(int);
    #pragma intrinsic(_fmp_cos, name => "fmp_cos");

    extern int _fmp_cos15(int);
    #pragma intrinsic(_fmp_cos15, name => "fmp_cos15");

    extern int _fmp_sin(int);
    #pragma intrinsic(_fmp_sin, name => "fmp_sin");

    extern int _fmp_sin15(int);
    #pragma intrinsic(_fmp_sin15, name => "fmp_sin15");

    extern int _fmp_atan(int);
    #pragma intrinsic(_fmp_atan, name => "fmp_atan");

    extern int _fmp_atan15(int);
    #pragma intrinsic(_fmp_atan15, name => "fmp_atan15");

    extern int _fmp_exp2(int);
    #pragma intrinsic(_fmp_exp2, name => "fmp_exp2");

    extern int _fmp_exp215(int);
    #pragma intrinsic(_fmp_exp215, name => "fmp_exp215");

    extern int _fmp_log2(int);
    #pragma intrinsic(_fmp_log2, name => "fmp_log2");

    extern int _fmp_log215(int);
    #pragma intrinsic(_fmp_log215, name => "fmp_log215");
#endif /* __Xfastmath_trig  */

#ifdef __cplusplus
}
#endif

#endif /* _ARC_INTRINSICS_H */

