Still no luck. Also, it's actually a division, not a multiplication, in the loop. However:
Quote:
// Transform matrices and view parameters read by the shaders below.
// NOTE(review): nothing visible here assigns them; presumably the host application/engine supplies them - confirm.
float4x4 matWorldViewProj; // applied to the input vertex position to produce the POSITION output in ReflectVS
float4x4 matWorld; // applied to the vertex position and the normal in ReflectVS
float4x4 matViewProj; // applied to the reflection vector in ReflectPS
float4x4 matView; // not referenced by the code shown here
float4x4 matProj; // not referenced by the code shown here
float4 vecViewPos; // .xyz is used as the viewer position when building the view vector in ReflectPS
float4 vecViewDir; // not referenced by the code shown here
float4 vecViewPort; // .zw is added as an offset inside the calc_ScreenPos overloads

// Textures bound to the two samplers declared below.
// NOTE(review): the names suggest the rendered camera image and a scene depth map - confirm against the host code.
texture bmp_camera_bmap; // sampled through CameraSampler
texture bmp_depth_bmap; // sampled through DepthSampler

// Sampler for bmp_camera_bmap; ReflectPS reads its final output colour through it.
// Both axes use border addressing, so lookups outside the [0,1] range do not wrap or clamp to the edge texel.
// No filter or border colour states are set here, so those are left at whatever defaults apply.
sampler CameraSampler = sampler_state
{
Texture = <bmp_camera_bmap>;
AddressU = border;
AddressV = border;
};

// Sampler for bmp_depth_bmap; ReflectPS reads its .x channel and compares it against the ray's z value.
// Both axes use clamp addressing, so lookups outside the [0,1] range use the nearest edge texel.
// NOTE(review): no filter states are set; if the depth map must not be interpolated, confirm the default filtering is suitable.
sampler DepthSampler = sampler_state
{
Texture = <bmp_depth_bmap>;
AddressU = Clamp;
AddressV = Clamp;
};

// Vertex Shader:
// Transforms the vertex with matWorldViewProj and forwards the data ReflectPS reads.
//   InPos     - vertex position
//   InNormal  - vertex normal
//   InTex     - texture coordinate
//   OutTex    - InTex, passed through unchanged
//   OutNormal - InNormal transformed by the upper-left 3x3 of matWorld, normalized
//   wPos      - xyz of InPos transformed by matWorld
//   OutPos    - copy of pPos in a TEXCOORD register so the pixel shader can read it
//   pPos      - InPos transformed by matWorldViewProj (POSITION output)
void ReflectVS(
    in float4 InPos : POSITION,
    in float3 InNormal : NORMAL,
    in float2 InTex : TEXCOORD0,

    out float2 OutTex : TEXCOORD0,
    out float3 OutNormal: TEXCOORD1,
    out float3 wPos : TEXCOORD2,
    out float4 OutPos : TEXCOORD3,
    out float4 pPos : POSITION

)
{
    pPos = mul(InPos, matWorldViewProj);
    OutPos = pPos;

    // The normal is a direction, so only the 3x3 part of matWorld applies.
    // The cast states that explicitly instead of relying on an implicit
    // truncation of the float4x4 operand.
    // NOTE(review): this is only a correct normal transform if matWorld has
    // no non-uniform scale - confirm.
    OutNormal = normalize(mul(InNormal, (float3x3)matWorld));

    OutTex = InTex;

    // mul() yields a float4 here; take xyz explicitly rather than letting the
    // assignment to a float3 truncate it implicitly.
    wPos = mul(InPos, matWorld).xyz;
}

// Converts a position to screen texture coordinates: x and negated y are
// divided by w, vecViewPort.zw is added, and the result is scaled by 0.5 and
// offset by 0.5.
float2 calc_ScreenPos(float4 pPos)
{
    float2 flipped = float2(pPos.x, -pPos.y);
    float2 projected = flipped / pPos.w;
    float2 shifted = projected + vecViewPort.zw;
    return shifted * 0.5 + 0.5;
}

// Three-component variant: same mapping as the float4 overload, but the
// divisor is the z component of the input.
float2 calc_ScreenPos(float3 pPos)
{
    float2 flipped = float2(pPos.x, -pPos.y);
    float2 projected = flipped / pPos.z;
    float2 shifted = projected + vecViewPort.zw;
    return shifted * 0.5 + 0.5;
}

// Variant taking xy and the divisor separately: negated-y xy is divided by
// depth, vecViewPort.zw is added, and the result is scaled by 0.5 and offset
// by 0.5.
float2 calc_ScreenPos(float2 pPos_xy, float depth)
{
    float2 flipped = float2(pPos_xy.x, -pPos_xy.y);
    float2 projected = flipped / depth;
    float2 shifted = projected + vecViewPort.zw;
    return shifted * 0.5 + 0.5;
}

// Pixel Shader:
// Reflection lookup done in screen space. Builds the reflection vector from
// the interpolated normal and the view vector, steps it through the depth map
// using coordinates of the form (uv * z, z), and outputs the camera image at
// the position the stepping ends on.
//   Tex    - texture coordinate from ReflectVS (not used in this shader)
//   Normal - normal from ReflectVS; re-normalized here
//   wPos   - position transformed by matWorld in ReflectVS
//   pPos   - position transformed by matWorldViewProj in ReflectVS
//   COL    - output colour: the CameraSampler texel with alpha 1, or all
//            zeros when no sample along the ray reached the depth-map value
void ReflectPS(
    in float2 Tex : TEXCOORD0,
    in float3 Normal : TEXCOORD1,
    in float3 wPos : TEXCOORD2,
    in float4 pPos : TEXCOORD3,

    out float4 COL : COLOR0
)
{
    // Ray origin: screen UV multiplied by w, so that xy / z yields the UV again.
    float2 sTex = calc_ScreenPos(pPos);
    float3 screenPos = float3(sTex.xy * pPos.w, pPos.w);

    Normal = normalize(Normal);
    float3 View = normalize(vecViewPos.xyz - wPos);

    // Mirror the view vector about the normal.
    float3 R = Normal * dot(Normal, View) * 2.f - View;

    //--------------//

    // Express the reflection vector in the same (uv * z, z) form as screenPos.
    float3 sR = mul(R, matViewProj).xyz;
    sR.xy = calc_ScreenPos(sR.xyz) * sR.z;

    float3 screenDir = normalize(sR.xyz);

    //-------------------------------------//

    const int step_count = 32;
    const float init_step_size = 40.f;

    // Factor applied to step_size on the next forward step; set to 0.5 after
    // an overshoot so the step keeps shrinking while closing in.
    float halve_again = 1.f;

    float step_size = init_step_size;
    float depth;
    float2 sUV;

    float3 oldPos = screenPos;
    float3 newPos = screenPos + screenDir * step_size;

    for(int i=0; i<step_count; ++i)
    {
        sUV = newPos.xy / newPos.z;

        depth = tex2Dlod(DepthSampler, float4(sUV,0,0)).x;

        if(newPos.z < depth)
        {
            // Sample is still below the stored depth value: accept it and advance.
            oldPos = newPos;

            step_size *= halve_again;
            halve_again = 1.f;

            newPos += screenDir * step_size;
        }
        else
        {
            // Sample reached or passed the stored depth value: retry from the
            // last accepted position with half the step.
            step_size *= 0.5f;
            newPos = oldPos + screenDir * step_size;

            halve_again = 0.5f;
        }
    }

    sUV = newPos.xy / newPos.z;

    //-------------------------------------//

    // step_size is only ever multiplied by 1 or 0.5, so it is still exactly
    // init_step_size if and only if the else branch above never ran.
    if(step_size == init_step_size)
    {
        COL = 0;
        return;
    }

    COL = tex2D(CameraSampler, sUV);

    //--------------//

    COL.a = 1;
}

// Single-pass technique: compiles ReflectVS as the vertex shader (vs_3_0 profile)
// and ReflectPS as the pixel shader (ps_3_0 profile).
technique ReflectTechnique
{
pass P0
{
VertexShader = compile vs_3_0 ReflectVS();
PixelShader = compile ps_3_0 ReflectPS();
}
}

It's bugging me since it really shouldn't be that hard to get it running.