2009-12-01 Till Straumann <strauman@slac.stanford.edu>
* Makefile.am, mpc6xx/altivec: new directory implementing support for AltiVec context saving/restoring.
@@ -1,3 +1,8 @@
2009-12-01  Till Straumann <strauman@slac.stanford.edu>

        * Makefile.am, mpc6xx/altivec: new directory implementing
        support for AltiVec context saving/restoring.

2009-12-01  Till Straumann <strauman@slac.stanford.edu>

        * mpc6xx/mmu/bat.c, mpc6xx/mmu/pte121.c: skip data-

@@ -241,7 +241,14 @@ noinst_PROGRAMS += mpc6xx/timer.rel
mpc6xx_timer_rel_SOURCES = mpc6xx/timer/timer.c
mpc6xx_timer_rel_CPPFLAGS = $(AM_CPPFLAGS)
mpc6xx_timer_rel_LDFLAGS = $(RTEMS_RELLDFLAGS)

# mpc6xx/altivec
noinst_PROGRAMS += mpc6xx/altivec.rel
mpc6xx_altivec_rel_SOURCES = mpc6xx/altivec/vec_sup.c mpc6xx/altivec/vec_sup_asm.S
mpc6xx_altivec_rel_CPPFLAGS = $(AM_CPPFLAGS)
mpc6xx_altivec_rel_LDFLAGS = $(RTEMS_RELLDFLAGS)
endif
EXTRA_DIST += mpc6xx/altivec/README

if e500
# mpc6xx/clock

c/src/lib/libcpu/powerpc/mpc6xx/altivec/README (new file)
@@ -0,0 +1,184 @@
RTEMS ALTIVEC SUPPORT
=====================

1. History
----------

Altivec support was developed and maintained as a user-extension
outside of RTEMS. This extension is still available (unbundled)
from Till Straumann <strauman@slac.stanford.edu>; it is useful
if an application desires 'lazy switching' of the altivec context.

2. Modes
--------

Altivec support -- the unbundled extension, that is -- can be used
in two ways:

a. All tasks are implicitly AltiVec-enabled.

b. Only designated tasks are AltiVec-enabled. 'Lazy-context switching'
   is implemented to switch the AltiVec context.

Note that the code implemented in this directory supports mode 'a'
and mode 'a' ONLY. For mode 'b' you need the unbundled extension
(which is completely independent of this code).

Mode 'a' (All tasks are AltiVec-enabled)
- - - - - - - - - - - - - - - - - - - - -

The major disadvantage of this mode is that additional overhead is
involved: tasks that never use the vector unit still save/restore
the volatile vector registers (20 registers * 16 bytes each) across
every interrupt and all non-volatile registers (12 registers * 16 bytes each)
during every context switch.

However, saving/restoring e.g., the volatile registers is quite
fast -- on my 1GHz 7457 saving or restoring 20 vector registers
takes only about 1us or even less (if there are cache hits).

The advantage is complete transparency to the user and full ABI
compatibility (except for ISRs and exception handlers), see below.

Mode 'b' (Only dedicated tasks are AltiVec-enabled)
- - - - - - - - - - - - - - - - - - - - - - - - - -

The advantage of this mode of operation is that the vector registers
are only saved/restored when a different, altivec-enabled task becomes
ready to run. In particular, if there is only a single altivec-enabled
task then the altivec context is *never* switched.

Note that this mode of operation is not supported by the code
in this directory -- you need the unbundled altivec extension
mentioned above.

3. Compiler Options
-------------------

Three compiler options affect AltiVec: -maltivec, -mabi=altivec and
-mvrsave=yes/no.

-maltivec: This lets the cpp define the symbol __ALTIVEC__ and enables
           gcc to emit vector instructions. Note that gcc may use the
           AltiVec engine implicitly, i.e., **without you writing any
           vectorized code**.

-mabi=altivec: This option has two effects:
           i)  It ensures 16-byte stack alignment required by AltiVec
               (even in combination with eabi which is RTEMS' default).
           ii) It allows vector arguments to be passed in vector registers.

-mvrsave=yes/no: Instructs gcc to emit code which sets the VRSAVE register
           indicating which vector registers are 'currently in use'.
           Because the altivec support does not use this information *) the
           option has no direct effect but it is desirable to compile with
           -mvrsave=no so that no unnecessary code is generated.

           *) The file vec_sup_asm.S conditionally disables usage of
              the VRSAVE information if the preprocessor symbol
              'IGNORE_VRSAVE' is defined, which is the default.

              If 'IGNORE_VRSAVE' is undefined then the code *does*
              use the VRSAVE information but I found that this does
              not execute noticeably faster.

IMPORTANT NOTES
===============

AFAIK, RTEMS uses the EABI which requires a stack alignment of only 8 bytes
which is NOT enough for AltiVec (which requires 16-byte alignment).

There are two ways of obtaining 16-byte alignment:

I)  Compile with -mno-eabi (ordinary SYSV ABI has 16-byte alignment)
II) Compile with -mabi=altivec (extension to EABI; maintains 16-byte alignment
    but also allows for passing vector arguments in vector registers)

Note that it is crucial to compile ***absolutely everything*** with the same
ABI options (or a linker error may occur). In particular, this includes

- newlibc multilib variant
- RTEMS proper
- application + third-party code

IMO the proper compiler options for Mode 'a' would be

    -maltivec -mabi=altivec -mvrsave=no

Note that the -mcpu=7400 option also enables -maltivec and -mabi=altivec
but leaves -mvrsave at some 'default' which is probably 'no'.
Compiling with -mvrsave=yes does not produce incompatible code but
may have a performance impact (since extra code is produced to maintain
VRSAVE).

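A minimal sketch of code relying on these options (illustration only -- the
file name vadd4.c and the function are made up, and the compiler driver name
depends on your toolchain): with -maltivec in effect the 'vector' keyword and
altivec.h become available, and vec_add() maps to a single AltiVec instruction.

    /* vadd4.c -- e.g.: gcc -maltivec -mabi=altivec -mvrsave=no -c vadd4.c */
    #include <altivec.h>

    /* Add two groups of four 32-bit integers with one vector operation. */
    void vadd4(const vector unsigned int *a,
               const vector unsigned int *b,
               vector unsigned int       *sum)
    {
        /* gcc emits a 'vadduwm' here; without -maltivec this does not compile */
        *sum = vec_add(*a, *b);
    }
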
4. Multilib Variants
--------------------

The default GCC configuration for RTEMS contains a -mcpu=7400 multilib
variant which is the correct one to choose.

5. BSP 'custom' file.
---------------------

Now that you have the necessary newlib and libgcc etc. variants
you also need to build RTEMS accordingly.

In your BSP's make/custom/<bsp>.cfg file make sure the CPU_CFLAGS
select the desired variant:

for mode 'a':

    CPU_CFLAGS = ... -mcpu=7400

Note that since -maltivec globally defines __ALTIVEC__, RTEMS automatically
enables code that takes care of switching the AltiVec context as necessary.
This is transparent to application code.

6. BSP support
--------------

It is the BSP's responsibility to initialize MSR_VE, VSCR and VRSAVE
during early boot, ideally before any C-code is executed (because it
may, theoretically, use vector instructions).

The BSP must

- set MSR_VE
- clear VRSAVE; note that the probing algorithm for detecting
  whether -mvrsave=yes or 'no' was used relies on the BSP
  clearing VRSAVE during early start. Since no interrupts or
  context switches happen before the AltiVec support is initialized,
  clearing VRSAVE is no problem even if it turns out that -mvrsave=no
  was in effect (eventually a value of all-ones will be stored
  in VRSAVE in this case).
- clear VSCR

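As an illustration only, an early-boot routine along the following lines would
cover the three points above. The function name is hypothetical and real BSPs
typically do this in assembly before any C code runs; the sketch assumes
AltiVec is present and interrupts are still disabled.

    /* bsp_early_altivec_init() -- sketch, not part of this commit */
    void bsp_early_altivec_init(void)
    {
        unsigned long msr;

        /* set MSR_VE (bit 6 counting from the MSB, i.e., 0x02000000) */
        asm volatile("mfmsr %0" : "=r"(msr));
        msr |= (1 << (31 - 6));
        asm volatile("mtmsr %0; isync" : : "r"(msr));

        /* clear VRSAVE so the -mvrsave probe in vec_sup.c works as described */
        asm volatile("mtvrsave %0" : : "r"(0));

        /* clear VSCR; mtvscr takes a vector register, so zero v0 first */
        asm volatile("vxor 0, 0, 0; mtvscr 0" : : : "v0");
    }
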
7. PSIM note
------------

PSIM supports the AltiVec instruction set with the exception of
the 'data stream' instructions for cache prefetching. The RTEMS
altivec support includes run-time checks to skip these instructions
when executing on PSIM.

Note that AltiVec support within PSIM must be enabled at 'configure'
time by passing the 'configure' option

    --enable-sim-float=altivec

Note also that PSIM's AltiVec support has many bugs. It is recommended
to apply the patches filed as an attachment with gdb bug report #2461
prior to building PSIM.

The CPU type and corresponding multilib must be changed when
building RTEMS/psim:

edit make/custom/psim.cfg and change

    CPU_CFLAGS = ... -mcpu=603e

to

    CPU_CFLAGS = ... -mcpu=7400

This change must be performed *before* configuring RTEMS/psim.

c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c (new file)
@@ -0,0 +1,228 @@
/* $Id$ */

/* Altivec support for RTEMS; vector register context management.
 * This is implemented as a user extension.
 *
 * Author: Till Straumann <strauman@slac.stanford.edu>, 2005
 */

#ifdef __ALTIVEC__

#include <rtems.h>
#include <libcpu/cpuIdent.h>
#include <rtems/bspIo.h>
#include <rtems/error.h>
#include <rtems/score/cpu.h>
#include <rtems/powerpc/powerpc.h>

#define STATIC static

#define VEC_ALIGNMENT 16

#define NAM "AltiVec Support"
#define ERRID(a,b,c,d) (((a)<<24) | ((b)<<16) | ((c)<<8) | (d))

typedef uint32_t _vu32 __attribute__((vector_size(VEC_ALIGNMENT)));

#ifndef MSR_VE
#define MSR_VE (1<<(31-6))
#endif

/* NOTE: These two variables are accessed by assembly code
 * which assumes 32-bit data!
 */
uint32_t _CPU_altivec_ctxt_off = 0;
uint32_t _CPU_altivec_psim_cpu = 0;

static inline uint32_t
mfmsr(void)
{
  uint32_t v;
  _CPU_MSR_GET(v);
  return v;
}

static inline void
mtmsr(uint32_t v)
{
  _CPU_MSR_SET(v);
}

static inline void
isync(void)
{
  asm volatile("isync");
}

static inline void
dssall(void)
{
  if ( !_CPU_altivec_psim_cpu )
    asm volatile("dssall");
}

static inline uint32_t
set_MSR_VE(void)
{
  uint32_t rval;
  rval = mfmsr();
  if ( ! (MSR_VE & rval) ) {
    mtmsr(rval | MSR_VE);
    isync();
  }
  return rval;
}

static inline void
clr_MSR_VE(void)
{
  dssall();
  mtmsr(mfmsr() & ~MSR_VE);
  isync();
}

static inline void
rst_MSR_VE(uint32_t old)
{
  if ( ! (MSR_VE & old) ) {
    dssall();
    mtmsr(old);
    isync();
  }
}


/* Code to probe the compiler's stack alignment (PowerPC);
 * The routine determines at run-time if the compiler generated
 * 8 or 16-byte aligned code.
 *
 * Till Straumann <strauman@slac.stanford.edu>, 2005
 */

static void dummy(void) __attribute__((noinline));
/* add (empty) asm statement to make sure this isn't optimized away */
static void dummy(void) { asm volatile(""); }

static unsigned probe_r1(void) __attribute__((noinline));
static unsigned probe_r1(void)
{
  unsigned r1;
  /* call something to enforce creation of a minimal stack frame;
   * (8 bytes: r1 and lr space for 'dummy' callee). If compiled
   * with -meabi -mno-altivec gcc allocates 8 bytes, if -mno-eabi
   * or -maltivec / -mabi=altivec then gcc allocates 16 bytes
   * according to the sysv / altivec ABI specs.
   */
  dummy();
  /* return stack pointer */
  asm volatile("mr %0,1":"=r"(r1));
  return r1;
}

static unsigned
probe_ppc_stack_alignment(void)
{
  unsigned r1;
  asm volatile("mr %0,1":"=r"(r1));
  return (r1 - probe_r1()) & ~0xf;
}

STATIC int check_stack_alignment(void)
{
  int rval = 0;
  if ( VEC_ALIGNMENT > PPC_STACK_ALIGNMENT ) {
    printk(NAM": CPU support has insufficient stack alignment;\n");
    printk("modify 'cpukit/score/cpu/powerpc/rtems/score/powerpc.h'\n");
    printk("and choose PPC_ABI_SVR4. I'll enable a workaround for now.\n");
    rval |= 1;
  }
  /* Run-time check; should compile with -mabi=altivec */
  if ( probe_ppc_stack_alignment() < VEC_ALIGNMENT ) {
    printk(NAM": run-time stack alignment insufficient; make sure you compile with -mabi=altivec\n");
    rval |= 2;
  }
  return rval;
}


static uint32_t probe_vrsave(_vu32 *p_v) __attribute__((noinline));

/* Check if this code was compiled with -mvrsave=yes or no
 * so that we can set the default/init value accordingly.
 */
static uint32_t probe_vrsave(_vu32 *p_v)
{
  _vu32    x;
  uint32_t vrsave;
  /* Explicitly clobber a volatile vector reg (0) that is
   * not used to pass return values.
   * If -mvrsave=yes was used this should cause gcc to
   * set bit 0 in vrsave. OTOH this bit cannot be set
   * because v0 is volatile and not used to pass a value
   * to the caller...
   */
  asm volatile("vxor %0, 0, 0; mfvrsave %1":"=v"(x),"=r"(vrsave)::"v0");
  if ( p_v ) {
    *p_v = x;
  }
  return vrsave;
}

static int vrsave_yes(void) __attribute__((noinline));

static int vrsave_yes(void)
{
  uint32_t vrsave_pre;
  asm volatile("mfvrsave %0":"=r"(vrsave_pre));
  if ( (vrsave_pre & 0x80000000) ) {
    printk(NAM": WARNING - unable to determine whether -mvrsave was used; assuming NO\n");
    return 0;
  }
  return probe_vrsave(0) != vrsave_pre;
}

extern void
_CPU_altivec_set_vrsave_initval(uint32_t);


void
_CPU_Initialize_altivec(void)
{
  unsigned pvr;

  /* I don't like to have to #define the offset of the altivec area
   * for use by assembly code.
   * Therefore, we compute it here and store it in memory...
   */
  _CPU_altivec_ctxt_off = (uint32_t) &((Context_Control*)0)->altivec;
  /*
   * Add space possibly needed for alignment
   */
  _CPU_altivec_ctxt_off += PPC_CACHE_ALIGNMENT - 1;

  if ( ! vrsave_yes() ) {
    /* They seemed to compile with -mvrsave=no. Hence we
     * must set VRSAVE so that all registers are saved/restored
     * in case this support was not built with IGNORE_VRSAVE.
     */
    _CPU_altivec_set_vrsave_initval( -1 );
  }

  if ( check_stack_alignment() & 2 )
    rtems_fatal_error_occurred(ERRID('V','E','C','1'));

  pvr = get_ppc_cpu_type();
  /* psim has altivec but lacks the streaming instructions :-( */
  _CPU_altivec_psim_cpu = (PPC_PSIM == pvr);

  if ( ! ppc_cpu_has_altivec() ) {
    printk(NAM": This CPU seems not to have AltiVec\n");
    rtems_panic("Unable to initialize AltiVec Support\n");
  }

  if ( ! (mfmsr() & MSR_VE) ) {
    printk(NAM": Warning: BSP should set MSR_VE early; doing it now...\n");
    set_MSR_VE();
  }
}
#endif

c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S (new file)
@@ -0,0 +1,783 @@
#ifdef __ALTIVEC__

#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

    .set v0,  0
    .set v8,  8
    .set v16, 16
    .set v20, 20
    .set v24, 24
    .set v28, 28

    .set r0,  0
    .set r3,  3
    .set r4,  4
    .set r5,  5
    .set r6,  6
    .set r7,  7

    .set r10, 10
    .set r11, 11
    .set r12, 12

    .set cr5, 5

    .set VECSIZE, 16

    .set VRSAVE_INIT_VAL, 0
    .set VSCR_INIT_VAL,   0

    .set VRSAVE_OFF, 16
    .set VSCR_OFF,   16+12

    .set ds0, 0

    /* Block size for dst -- in units of 16-bytes */
    .set BSIZE,   2      /* = 32 bytes */
    .set BCNT,    12/2+1 /* 12 non-volatile registers + area for vscr/vrsave */
    .set BSTRIDE, 32     /* bytes */

    .data

    .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
    .long 0

    .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
    .long 0

    .text

    .extern _CPU_altivec_psim_cpu
    .extern _CPU_altivec_ctxt_off

    .macro CMPOFF _B0
    lis \_B0, _CPU_altivec_ctxt_off@ha
    lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0)
    .endm

/* Conditionally load or store a vector _VR to
 * EA(_R1|0 + _R2)
 * If bit _VR (corresponding to _VR) is set in CRC
 * then the load/store is performed but otherwise
 * it is skipped.
 * If compiled with IGNORE_VRSAVE defined then
 * the load/store is done unconditionally.
 *
 * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
 * _VR    : target vector register
 * _R1    : base register (NOTE: _R1=r0 uses an
 *          implicit ZERO constant, not the contents
 *          of r0) for address computation.
 * _R2    : 'offset' register for address computation.
 *
 * MODIFIES:     _VR on output if a load operation is performed.
 * IMPLICIT USE: CRC (unless compiled with IGNORE_VRSAVE
 *               defined).
 */
    .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
    bc 4, \_VR, 111f
#endif
    \_OPCODE \_VR, \_R1, \_R2
111:
    .endm

/*
 * Load or store four 'adjacent' vector registers.
 *
 * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
 * _VR    : target vector register
 * _R1    : base register (NOTE: _R1=r0 uses an
 *          implicit ZERO constant, not the contents
 *          of r0) for address computation.
 * _B0    : base register 0
 * _B1    : base register 1
 * _B2    : base register 2
 * _B3    : base register 3
 * _RO    : offset register
 *
 * memory addresses for _VR, _VR+1, _VR+2, _VR+3
 * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
 *
 * MODIFIES:     _VR, _VR+1, _VR+2, _VR+3 if a load
 *               operation is performed.
 * IMPLICIT USE: see LDST
 */
    .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
    .endm

/*
 * Preload/zero two cache lines and save 4 vector registers
 * to memory.
 * Note that the cache operation targets memory *past* the
 * current storage area which should hopefully hit when
 * this same code is executed on the next two cache lines...
 *
 * This code effectively does
 *   dcbz (_B0 + 64)
 *   dcbz (_B0 + 64 + 32)
 *   stvx _VR+0, (_B0+ 0)
 *   stvx _VR+1, (_B0+16)
 *   stvx _VR+2, (_B0+32)
 *   stvx _VR+3, (_B0+48)
 *
 * _LRU: may be 'l' or empty. The former variant should be
 *       used when it is conceivable that the memory area is
 *       unlikely to be used in the near future thus making
 *       it a candidate for early eviction from the caches.
 *
 *       If it is likely that the memory area is reused soon
 *       (e.g., save/restore across ISR execution) then the
 *       'stvx' opcode (w/o 'l' suffix) should be used.
 *
 * _VR:  first of four target vector registers; _VR+0,
 *       _VR+1, _VR+2, _VR+3 are saved.
 *
 * _B0:  base address of memory area.
 * _B1:  should contain _B0+16 on entry
 * _B2:  should contain _B0+32 on entry
 * _B3:  should contain _B0+48 on entry
 *
 * _O1:  contains the offset where the four vectors are
 *       stored.
 *         _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
 *         _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
 *         _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
 *         _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
 * _O2:  is set to _O1 + 64 by this macro. Hence _O2 is
 *       used to address the two cache-lines past the
 *       current memory area.
 *
 * MODIFIES: _O2; contains _O1 + 64 after execution of this
 *       code.
 *
 * NOTES: a different set of four vectors can be addressed
 *       simply by changing the one offset register _O1.
 *
 *       Saving more than 4 registers can simply be
 *       achieved by expanding this macro multiple
 *       times with _O1 and _O2 swapped (new _O1
 *       becomes _O2 = old _O1 + 64) thus stepping
 *       through the memory area.
 *
 */
    .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
    dcbz \_B0, \_O2
    dcbz \_B2, \_O2
    LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
    .endm

/*
|
||||
* Save eight vector registers by expanding S4VEC_P twice.
|
||||
* See notes for S4VEC_P above.
|
||||
*
|
||||
* INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
|
||||
*
|
||||
* MODIFIES: After execution,
|
||||
* _O2 contains original _O1 + 64,
|
||||
* _O1 contains original _O1 + 128
|
||||
*
|
||||
* NOTES: Expanding this macro multiple times lets you save
|
||||
* multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
|
||||
*/
|
||||
.macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
|
||||
S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
|
||||
/* Note that the roles of _O1 and _O2 are swapped here */
|
||||
S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
|
||||
*
|
||||
* See notes above (for S4VEC_P).
|
||||
*
|
||||
* INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
|
||||
* MODIFIES: _O1 contains original _O1 + 256
|
||||
* _O2 contains original _O1 + 256 - 64
|
||||
*/
|
||||
.macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
|
||||
S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
|
||||
S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
|
||||
LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
|
||||
*
|
||||
* See notes above (for S4VEC_P, S_V0TOV19).
|
||||
*
|
||||
* INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
|
||||
* MODIFIES: _O1 contains original _O1 + 128
|
||||
* _O2 contains original _O1 + 128 - 64
|
||||
*/
|
||||
.macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
|
||||
S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
|
||||
LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Save all registers to memory area
|
||||
*
|
||||
* INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above)
|
||||
* MODIFIES: _O1 contains original _O1 + 512
|
||||
* _O2 contains original _O1 + 512 - 64
|
||||
*/
|
||||
.macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
|
||||
S8VEC_P l v0 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
|
||||
S8VEC_P l v8 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
|
||||
S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
|
||||
S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
|
||||
LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
 * We can pass either of them as arguments to another macro which
 * allows us to decide if the main macro uses dcbt or not when
 * we expand it...
 */
    .macro DO_DCBT _RA, _RB
    dcbt \_RA, \_RB
    .endm

    .macro NO_DCBT _RA, _RB
    .endm

/*
 * NOTE REGARDING dcbt VS dst
 *
 * Preloading the cache with memory areas that we soon need
 * can be done either using 'dcbt' or 'dst' instructions
 * "ahead of time".
 * When experimenting (on a mpc7457) I found that the 'dst'
 * stream instruction was very efficient if there is enough
 * time to read ahead. It works well when we do a context
 * switch:
 *
 *   1) start DST on new context to be loaded
 *   2) save old context to memory
 *   3) load new context from memory
 *
 * Because of the interleaved step 2) dst works nicely and
 * 3) finds what it needs in the cache.
 *
 * However, in a situation when there is not much time
 * to start the DST, e.g., because we want to restore
 * a context out of the blue (e.g., after returning
 * from an ISR):
 *
 *   1) save volatile registers to memory/stack
 *   2) execute ISR
 *   3) might do a task context switch
 *   4) when returned to old task context then
 *      reload volatile registers from memory/stack.
 *
 * In this situation, preloading the target memory before
 * or after step 1) makes obviously no sense because after
 * 1) the registers area is most likely in the cache already.
 *
 * Starting preload after 2) doesn't make much sense either.
 * If ISR doesn't lead to a context switch then it is quite
 * likely that the register area is still in the cache.
 * OTOH, if a context switch happens then the preload after 2)
 * might be useless.
 *
 * This leaves us at step 4) where we want to load immediately.
 * In this case, I found that 'dcbt' works more efficiently
 * so that's what we use when restoring volatile registers.
 *
 * When restoring the non-volatile VRs during a 'normal'
 * context switch then we shall use DST (and no dcbt).
 */

/*
 * Symmetric to S4VEC_P above but addresses loading four
 * vector registers from memory.
 *
 * Touches two cache lines past the current memory area
 * and loads four vectors from the current area.
 *
 * Optionally, the DCBT operation may be omitted
 * (when expanding with _DCBT=NO_DCBT).
 * This is useful if the cache was already preloaded
 * by another means (dst instruction).
 *
 * NOTE: We always use the 'LRU' form of lvx: lvxl,
 *       because we deem it unlikely that the context
 *       that was just loaded has to be saved again
 *       to memory in the immediate future.
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O2 contains original _O1 + 64.
 *           _VR.._VR+3 loaded from memory.
 */
    .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
    /* preload/touch 2 lines at offset 64 from _B0 */
    \_DCBT \_B0, \_O2
    \_DCBT \_B2, \_O2
    /* load four vectors at offset 0 from _B0 */
    LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

/*
 * Symmetric to S8VEC_P; loads 8 vector registers
 * from memory -- see comments above...
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 128.
 *           _O2 contains original _O1 + 64.
 *           _VR.._VR+7 loaded from memory.
 */
    .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
    .endm

/*
 * Load volatile vector registers v0..v19 employing
 * the DCBT to preload the cache. The rationale for
 * using DCBT here but not when restoring non-volatile
 * registers is explained above, see
 *
 *   "NOTE REGARDING dcbt VS dst"
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 256.
 *           _O2 contains original _O1 + 256 - 64.
 *           VR0..VR19 loaded from memory.
 */
    .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v16, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

/*
 * Load non-volatile vector registers v20..v31.
 * Note that no DCBT is performed since we use
 * DST for preloading the cache during a context
 * switch, see
 *
 *   "NOTE REGARDING dcbt VS dst"
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 128.
 *           _O2 contains original _O1 + 128 - 64.
 *           VR20..VR31 loaded from memory.
 */
    .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

/*
 * Load all registers from memory area.
 */
    .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A DO_DCBT, v0,  \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v8,  \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O2
    .endm

/*
 * Compute
 *   _B1 = _B0 + 16
 *   _B2 = _B0 + 32
 *   _B3 = _B0 + 48
 * and load
 *   _RO = 0
 *
 * convenience macro to be expanded before
 * any of the load/store macros that use
 * four base addresses etc.
 *
 * INPUT:    _B0 = cache-aligned start of memory area
 *
 * MODIFIES: _B1, _B2, _B3, _RO as described above.
 */
    .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
    addi \_B1, \_B0, 1*VECSIZE
    addi \_B2, \_B0, 2*VECSIZE
    addi \_B3, \_B0, 3*VECSIZE
    li \_RO, 0
    .endm

/*
 * Prepare for saving general vector registers.
 *
 * If not built with #define IGNORE_VRSAVE then
 *
 *   1) copy vrsave to CRC
 *
 * endif
 *
 *   2) copy vrsave to _VRSAVE_REG
 *   3) preload/zero cache line where vrsave and vscr are stored.
 *   4) compute base addresses from _B0
 *   5) preload/zero first two cache lines (remember that the
 *      first S8VEC_P starts preloading/zeroing at offset 64).
 *
 * INPUT:    'vrsave' register, _B0 (base address of memory area)
 * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
 *           _B0 = original _B0 + 32
 *           _B1 = original _B0 + 32 + 16,
 *           _B2 = original _B0 + 32 + 32,
 *           _B3 = original _B0 + 32 + 48,
 *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
 */
    .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
    mfvrsave \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
    mtcr \_VRSAVE_REG
#endif
    dcbz 0, \_B0
    addi \_B0, \_B0, PPC_CACHE_ALIGNMENT
    dcbz 0, \_B0
    CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
    dcbz 0, \_B2
    .endm

/*
 * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
 * must have been loaded from 'vrsave' and 'vscr', respectively,
 * prior to expanding this macro.
 *
 * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
 *           _VSCR_VREG  VR holding 'vscr' contents
 *           _B0         cache-aligned (base) address of memory area.
 * MODIFIES: _SCRATCH_REG
 */
    .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
    stw \_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
    li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
    stvewx \_VSCR_VREG, \_B0, \_SCRATCH_REG
    .endm

/*
 * Load 'vrsave' and 'vscr' from memory.
 *
 * INPUTS:   _B0 cache-aligned (base) address of memory area.
 * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
 *           'vscr', 'vrsave'.
 *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
 *           with IGNORE_VRSAVE undefined).
 */
    .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
    lwz \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
    mtvrsave \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
    mtcr \_SCRATCH_REG
#endif
    li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
    lvewx \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
    mtvscr \_SCRATCH_VREG
    .endm

/*
 * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
 *
 * INPUT:    _B0
 * MODIFIES: _B0 (as stated above)
 */
    .macro CACHE_DOWNALGN _B0
    rlwinm \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
    .endm

    .text

    .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
    /* r0 now contains VRSAVE, r3 still the aligned memory area
     * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
     * respectively. r10 holds zero
     */
    S_V0TOV19 _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
    mfvscr v0
    /* Store vrsave (still in r0) and vscr (in v0) to memory area */
    S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
    /* Restore CRC */
    mtcr r12
#endif
    blr

    .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    /* Try to preload 1st line (where vscr and vrsave are stored) */
    dcbt 0, r3
    /* Point to start of general vector-register area */
    addi r3, r3, PPC_CACHE_ALIGNMENT
    /* Start preloading 2nd line (where first two vectors are) */
    dcbt 0, r3
    L_VSCR_VRSAVE r3, r0, v0
    CMP_BASES r3, r4, r5, r6, r10
    /* Start preloading 3rd line (where vectors 3 and 4 are) */
    dcbt 0, r5
    L_V0TOV19 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r12
#endif
    blr

    .global _CPU_Context_restore_altivec
_CPU_Context_restore_altivec:
    /* Restore is like 'switch' but we don't have
     * to save an old context.
     * Move argument to second arg and load NULL pointer
     * to first one, then jump to 'switch' routine.
     */
    mr r4, r3
    li r3, 0
    b _CPU_Context_switch_altivec

    .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

    /* fetch offset of altivec area in context */
    CMPOFF r5
    /* down-align 'to' area to cache-line boundary */
    add r4, r4, r5
    CACHE_DOWNALGN r4

    /* Check for PSIM */
    lis r6, _CPU_altivec_psim_cpu@ha
    lwz r6, _CPU_altivec_psim_cpu@l(r6)
    cmpli 0, r6, 0
    bne 1f
    /* Skip data-stream instructions on PSIM (not implemented) */
    dssall
    /* Pre-load new context into cache */
    lis r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
    ori r6, r6, BSTRIDE
    dstt r4, r6, ds0
1:

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    /* Is 'from' context == NULL ? (then we just do a 'restore') */
    cmpli 0, r3, 0
    beq 1f /* yes: skip saving 'from' context */

    /* SAVE NON-VOLATILE REGISTERS */

    /* Compute aligned destination pointer (r5 still holds offset
     * to 'altivec' area in context)
     */
    add r3, r3, r5
    CACHE_DOWNALGN r3

    PREP_FOR_SAVE r0, r3, r5, r6, r7, r10
    /* The manual says reading vscr can take some time - do
     * read it here (into a volatile vector register) while
     * we wait for cache blocks to be allocated
     */
    mfvscr v0
    S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11
    /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
    S_VSCR_VRSAVE r0, v0, r3, r5

1:

    /* LOAD NON-VOLATILE REGISTERS */

    /* Advance past vrsave/vscr area */
    addi r4, r4, PPC_CACHE_ALIGNMENT
    L_VSCR_VRSAVE r4, r0, v0
    CMP_BASES r4, r5, r6, r7, r10
    L_V20TOV31 r4, r5, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r12
#endif
    blr

    .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
    CMPOFF r5
    add r3, r3, r5
    CACHE_DOWNALGN r3
    lis r5, _CPU_altivec_vrsave_initval@ha
    lwz r5, _CPU_altivec_vrsave_initval@l(r5)
    stw r5, VRSAVE_OFF(r3)
    lis r6, _CPU_altivec_vscr_initval@ha
    lwz r6, _CPU_altivec_vscr_initval@l(r6)
    stw r6, VSCR_OFF(r3)
    blr

/*
 * Change the initial value of VRSAVE.
 * Can be used by initialization code if
 * it is determined that code was compiled
 * with -mvrsave=no. In this case, VRSAVE
 * must be set to all-ones which causes this
 * support code to save/restore *all* registers
 * (only has an effect if IGNORE_VRSAVE is
 * not defined -- otherwise all registers are
 * saved/restored anyways).
 */
    .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
    lis r5, _CPU_altivec_vrsave_initval@ha
    stw r3, _CPU_altivec_vrsave_initval@l(r5)
    mtvrsave r3
    blr

#ifdef ALTIVEC_TESTING
    .global msr_VE_on
msr_VE_on:
    mfmsr r3
    oris r3, r3, 1<<(31-6-16)
    mtmsr r3
    blr

    .global msr_VE_off
msr_VE_off:
    mfmsr r3
    lis r4, 1<<(31-6-16)
    andc r3, r3, r4
    mtmsr r3
    blr


    .global mfvrsave
mfvrsave:
    mfvrsave r3
    blr

    .global mtvrsave
mtvrsave:
    mtvrsave r3
    blr

/* Load all vector registers from memory area.
 * NOTE: This routine is not strictly ABI compliant --
 *       it guarantees that volatile vector registers
 *       have certain values on exit!
 */
    .global _CPU_altivec_load_all
_CPU_altivec_load_all:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    /* Try to preload 1st line (where vscr and vrsave are stored) */
    dcbt 0, r3
    /* Point to start of general vector-register area */
    addi r3, r3, PPC_CACHE_ALIGNMENT
    /* Start preloading 2nd line (where first two vectors are) */
    dcbt 0, r3
    L_VSCR_VRSAVE r3, r0, v0
    CMP_BASES r3, r4, r5, r6, r10
    /* Start preloading 3rd line (where vectors 3 and 4 are) */
    dcbt 0, r5
    L_V0TOV31 r3, r4, r5, r6, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r12
#endif
    blr

    .global _CPU_altivec_save_all
_CPU_altivec_save_all:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r12
#endif

    PREP_FOR_SAVE r0, r3, r4, r5, r6, r10
    /* r0 now contains VRSAVE, r3 still the aligned memory area
     * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3,
     * respectively. r10 holds zero
     */
    S_V0TOV31 _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11
    mfvscr v0
    /* Store vrsave (still in r0) and vscr (in v0) to memory area */
    S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
    /* Restore CRC */
    mtcr r12
#endif
    blr


#if 0
    .gnu_attribute 4,1
    .gnu_attribute 8,1
#endif

#endif
#endif