Skip to content
This repository was archived by the owner on Dec 2, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/GBI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ MicrocodeInfo* GBI_DetectMicrocode(u32 uc_start, u32 uc_dstart, u16 uc_dsize)

// See if we can identify it by text
char uc_data[2048];
UnswapCopy(&RDRAM[uc_dstart & 0x1FFFFFFF], uc_data, 2048);
unswap_copy(&RDRAM[uc_dstart & 0x1FFFFFFF], (uint8_t*)uc_data, 2048);
strcpy(uc_str, "Not Found");

for (u32 i = 0; i < 2048; i++)
Expand Down
2 changes: 1 addition & 1 deletion src/Textures.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ void TextureCache_LoadBackground(CachedTexture* texInfo)
bpl = gSP.bgImage.width << gSP.bgImage.size >> 1;
numBytes = bpl * gSP.bgImage.height;
swapped = (u8*)malloc(numBytes);
UnswapCopy(&RDRAM[gSP.bgImage.address], swapped, numBytes);
unswap_copy(&RDRAM[gSP.bgImage.address], swapped, numBytes);
dest = (u32*)malloc(texInfo->textureBytes);

clampSClamp = texInfo->width - 1;
Expand Down
191 changes: 60 additions & 131 deletions src/convert.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#ifndef CONVERT_H
#define CONVERT_H
#pragma once

const unsigned char Five2Eight[32] =
{
Expand Down Expand Up @@ -100,139 +99,70 @@ const unsigned char One2Eight[2] =
255, // 1 = 11111111
};

// Un-swaps on the dword, works with non-dword aligned addresses
/*inline void UnswapCopy( void *src, void *dest, u32 numBytes )
inline void bswap_4_x32_sse2(__m128i& vec)
{
__asm
__m128i tmp1 = _mm_srli_epi32(vec, 24);
__m128i tmp2 = _mm_slli_epi32(vec, 24);

__m128i t1 = _mm_and_si128(_mm_srli_epi32(vec, 8), _mm_set1_epi32(0x0000FF00));
__m128i t2 = _mm_and_si128(_mm_slli_epi32(vec, 8), _mm_set1_epi32(0x00FF0000));

vec = _mm_or_si128(tmp2, tmp1);
vec = _mm_or_si128(vec, t1);
vec = _mm_or_si128(vec, t2);
}

/**
* \brief Copies data from a source buffer to a destination buffer while performing a byteswap within 4-byte groups.
* \param src The source buffer.
* \param dst The destination buffer.
* \param num_bytes The number of bytes to copy.
*/
inline void unswap_copy(uint8_t* src, uint8_t* dst, u32 num_bytes)
{
const uintptr_t src_addr = reinterpret_cast<uintptr_t>(src);
u32 leading_bytes = src_addr & 3;

if (leading_bytes != 0)
{
mov ecx, 0
mov esi, dword ptr [src]
mov edi, dword ptr [dest]

mov ebx, esi
and ebx, 3 // ebx = number of leading bytes

cmp ebx, 0
jz StartDWordLoop

neg ebx
add ebx, 4
cmp ebx, [numBytes]
jle NotGreater
mov ebx, [numBytes]
NotGreater:
mov ecx, ebx

xor esi, 3

LeadingLoop: // Copies leading bytes, in reverse order (un-swaps)
mov al, byte ptr [esi]
mov byte ptr [edi], al
sub esi, 1
add edi, 1
loop LeadingLoop
add esi, 5

StartDWordLoop:
mov ecx, dword ptr [numBytes]
sub ecx, ebx // Don't copy what's already been copied

mov ebx, ecx
and ebx, 3 // ebx = number of trailing bytes

shr ecx, 2 // ecx = number of dwords

cmp ecx, 0 // If there's nothing to do, don't do it
jz StartTrailingLoop

// Copies from source to destination, bswap-ing first
DWordLoop:
mov eax, dword ptr [esi]
bswap eax
mov dword ptr [edi], eax
add esi, 4
add edi, 4
loop DWordLoop

StartTrailingLoop:
cmp ebx, 0
jz Done
mov ecx, ebx
add esi, 3

TrailingLoop:
mov al, byte ptr [esi]
mov byte ptr [esi], al
sub esi, 1
add edi, 1
loop TrailingLoop
Done:
leading_bytes = 4 - leading_bytes;
leading_bytes = min(leading_bytes, num_bytes);

for (u32 i = 0; i < leading_bytes; ++i)
dst[i] = src[3 - i];

src += leading_bytes;
dst += leading_bytes;
num_bytes -= leading_bytes;
}
}*/

inline void UnswapCopy(void* src, void* dest, u32 numBytes)
{
__asm
const u32 sse_block_size = (num_bytes / 16) * 16;
for (u32 i = 0; i < sse_block_size; i += 16)
{
mov ecx, 0
mov esi, dword ptr [src]
mov edi, dword ptr [dest]

mov ebx, esi
and ebx, 3 // ebx = number of leading bytes

cmp ebx, 0
jz StartDWordLoop
neg ebx
add ebx, 4

cmp ebx, [numBytes]
jle NotGreater
mov ebx, [numBytes]
NotGreater:
mov ecx, ebx
xor esi, 3
LeadingLoop: // Copies leading bytes, in reverse order (un-swaps)
mov al, byte ptr [esi]
mov byte ptr [edi], al
sub esi, 1
add edi, 1
loop LeadingLoop
add esi, 5

StartDWordLoop:
mov ecx, dword ptr [numBytes]
sub ecx, ebx // Don't copy what's already been copied

mov ebx, ecx
and ebx, 3
// add ecx, 3 // Round up to nearest dword
shr ecx, 2

cmp ecx, 0 // If there's nothing to do, don't do it
jle StartTrailingLoop

// Copies from source to destination, bswap-ing first
DWordLoop:
mov eax, dword ptr [esi]
bswap eax
mov dword ptr [edi], eax
add esi, 4
add edi, 4
loop DWordLoop
StartTrailingLoop:
cmp ebx, 0
jz Done
mov ecx, ebx
xor esi, 3

TrailingLoop:
mov al, byte ptr [esi]
mov byte ptr [edi], al
sub esi, 1
add edi, 1
loop TrailingLoop
Done:
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
bswap_4_x32_sse2(data);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst), data);
src += 16;
dst += 16;
}

num_bytes -= sse_block_size;

for (u32 i = 0; i < num_bytes / 4; ++i)
{
uint32_t val{};
std::memcpy(&val, src, sizeof(uint32_t));
val = (val >> 24 & 0x000000FF) | (val >> 8) & 0x0000FF00 | (val << 8) & 0x00FF0000 | (val << 24) & 0xFF000000;
std::memcpy(dst, &val, sizeof(uint32_t));
src += 4;
dst += 4;
}

const u32 trailing_bytes = num_bytes % 4;
if (trailing_bytes > 0)
{
for (u32 i = 0; i < trailing_bytes; ++i)
dst[i] = src[3 - i];
}
}

Expand Down Expand Up @@ -540,4 +470,3 @@ inline u32 I4_RGBA8888(u8 color)
mov al, cl
}
}
#endif
6 changes: 3 additions & 3 deletions src/gDP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,7 @@ void gDPLoadTile(u32 tile, u32 uls, u32 ult, u32 lrs, u32 lrt)

for (y = 0; y < height; y++)
{
UnswapCopy(src, dest, bpl);
unswap_copy(src, (uint8_t*)dest, bpl);
if (y & 1)
Interleave(dest, line);

Expand Down Expand Up @@ -643,7 +643,7 @@ void gDPLoadBlock(u32 tile, u32 uls, u32 ult, u32 lrs, u32 dxt)

for (u32 y = 0; y < height; y++)
{
UnswapCopy(src, dest, bpl);
unswap_copy((uint8_t*)src, (uint8_t*)dest, bpl);
if (y & 1)
Interleave(dest, line);

Expand All @@ -652,7 +652,7 @@ void gDPLoadBlock(u32 tile, u32 uls, u32 ult, u32 lrs, u32 dxt)
}
}
else
UnswapCopy(src, dest, bytes);
unswap_copy((uint8_t*)src, (uint8_t*)dest, bytes);

gDP.textureMode = TEXTUREMODE_NORMAL;
gDP.loadType = LOADTYPE_BLOCK;
Expand Down
2 changes: 2 additions & 0 deletions src/stdafx.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <cstdint>
#include <string>
#include <core_plugin.h>
#include <emmintrin.h>
#include <intrin.h>
#include "Types.h"

#define EXPORT __declspec(dllexport)
Expand Down
Loading