PSP texture swizzling
Internally, the GE processes textures in blocks of 16 bytes by 8 rows, independent of the actual pixel format (so a 32×32 texture with 32-bit pixels is treated as 128 bytes wide by 32 rows from the swizzling point of view). When a texture is not swizzled, the GE has to do scattered reads as it moves each block into its texture cache, which has a performance impact.
To improve read performance, texture pixels can be reordered, or "swizzled," into these blocks so the GE can fetch one entire block by reading sequentially.
Example code to re-order a texture into swizzled format (here width is the row width in bytes, not in pixels):
/*
 * Reorder a linear image into the swizzled layout, one byte at a time.
 *
 * The image is treated as raw bytes: each block is 16 bytes wide and 8 rows
 * tall (128 bytes), and blocks are stored one after another, left to right
 * and then top to bottom.
 *
 *   out    - destination buffer for the swizzled bytes
 *   in     - source image in linear (row-major) order
 *   width  - row width in bytes; the block count per row is width / 16
 *   height - number of rows
 */
void swizzle(u8* out, const u8* in, unsigned int width, unsigned int height) {
    const unsigned int blocks_per_row = width / 16;
    unsigned int row, col;

    for (row = 0; row < height; ++row) {
        /* Start of the first block in this band of 8 rows. */
        const unsigned int band_base = (row / 8) * blocks_per_row * 128;
        /* Offset of this row inside any block of the band. */
        const unsigned int row_in_block = (row % 8) * 16;

        for (col = 0; col < width; ++col) {
            const unsigned int block_base = band_base + (col / 16) * 128;
            const unsigned int col_in_block = col % 16;

            out[block_base + row_in_block + col_in_block] = in[col + row * width];
        }
    }
}
Or, as an alternative, here’s an optimized version that doesn’t do any heavy math in the inner loop:
/*
 * Reorder a linear image into the swizzled layout, copying each 16-byte
 * block row as four 32-bit words.
 *
 *   out    - destination buffer for the swizzled bytes
 *   in     - source image in linear (row-major) order
 *   width  - row width in bytes
 *   height - number of rows
 *
 * Only whole blocks are copied: width / 16 blocks per row and height / 8
 * block rows, so any remainder columns or rows are left untouched.
 * Both buffers are accessed through u32 pointers and therefore must be
 * suitably aligned for u32 access.
 */
void swizzle_fast(u8* out, const u8* in, unsigned int width, unsigned int height) {
    unsigned int blockx, blocky;
    unsigned int j;
    unsigned int width_blocks = (width / 16);
    unsigned int height_blocks = (height / 8);
    /* Words to skip after copying 16 bytes to land on the same column of the
       next source row. Wraps if width < 16, but then width_blocks is 0 and
       the value is never used. */
    unsigned int src_pitch = (width - 16) / 4;
    /* Bytes covered by one band of 8 source rows. */
    unsigned int src_row = width * 8;
    const u8* ysrc = in;
    u32* dst = (u32*)out;

    for (blocky = 0; blocky < height_blocks; ++blocky) {
        const u8* xsrc = ysrc;
        for (blockx = 0; blockx < width_blocks; ++blockx) {
            /* Keep the const qualifier: the source is only read. */
            const u32* src = (const u32*)xsrc;
            for (j = 0; j < 8; ++j) {
                /* One block row: 4 words = 16 bytes, written sequentially. */
                *(dst++) = *(src++);
                *(dst++) = *(src++);
                *(dst++) = *(src++);
                *(dst++) = *(src++);
                src += src_pitch;
            }
            xsrc += 16;
        }
        ysrc += src_row;
    }
}
To use a swizzled texture, pass GU_TRUE as the swizzle argument of sceGuTexMode:
/* The last argument enables swizzled texture reads. */
sceGuTexMode(GU_PSM_8888,0,0,GU_TRUE);
/* Arguments: mipmap level, width, height, buffer width, texture data. */
sceGuTexImage(0,width,height,width,p_swizzled_data);
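Unswizzling is similar. The helpers below translate a single byte offset between the linear and the swizzled layout, where log2_w is the base-2 logarithm of the row width in bytes: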
/*
 * Map a byte offset in a linear image to the matching offset in the
 * swizzled layout (blocks of 16 bytes by 8 rows, 128 bytes each).
 *
 *   offset - byte offset in the linear image (y * row_width + x)
 *   log2_w - base-2 logarithm of the row width in bytes; must be smaller
 *            than the bit width of unsigned
 *
 * Returns the offset unchanged when log2_w <= 4 (rows of 16 bytes or fewer).
 */
unsigned swizzle(unsigned offset, unsigned log2_w) {
    if (log2_w <= 4)
        return offset;
    /* All masks use unsigned constants: left-shifting the negative int ~7
       is undefined behaviour in C. */
    unsigned w_mask = (1u << log2_w) - 1u;
    unsigned mx = offset & 0xfu;             /* x inside the block (bits 0..3) */
    unsigned by = offset & (~7u << log2_w);  /* band of 8 rows; position is unchanged */
    unsigned bx = offset & w_mask & ~0xfu;   /* block column, as x rounded down to 16 */
    unsigned my = offset & (7u << log2_w);   /* row inside the block */
    /* Block column moves up by 3 bits (x16 -> x128); the in-block row moves
       down to sit directly above the 4 x bits. */
    return by | (bx << 3) | (my >> (log2_w - 4)) | mx;
}
/*
 * Map a byte offset in the swizzled layout (blocks of 16 bytes by 8 rows,
 * 128 bytes each) back to the matching offset in the linear image.
 *
 *   offset - byte offset in the swizzled image
 *   log2_w - base-2 logarithm of the row width in bytes; must be smaller
 *            than the bit width of unsigned
 *
 * Returns the offset unchanged when log2_w <= 4 (rows of 16 bytes or fewer).
 */
unsigned unswizzle(unsigned offset, unsigned log2_w) {
    if (log2_w <= 4)
        return offset;
    /* All masks use unsigned constants: left-shifting the negative int ~7
       is undefined behaviour in C. */
    unsigned w_mask = (1u << log2_w) - 1u;
    unsigned mx = offset & 0xfu;             /* x inside the block (bits 0..3) */
    unsigned by = offset & (~7u << log2_w);  /* band of 8 rows; position is unchanged */
    /* The block column occupies (log2_w - 4) bits starting at bit 7, so the
       mask must scale with the row width rather than always being 4 bits. */
    unsigned bx = offset & ((w_mask >> 4) << 7);
    unsigned my = offset & 0x70u;            /* row inside the block (bits 4..6) */
    /* Block column moves down by 3 bits (x128 -> x16); the in-block row
       moves up to sit directly above the x bits of a full row. */
    return by | (bx >> 3) | (my << (log2_w - 4)) | mx;
}