Tuesday, 14 October 2008

Raw binary data in Delphi 2009 strings, by example

This code snippet explains by example how you can use binary data in strings in Delphi 2009:

const
AllByteValues=
#$00#$01#$02#$03#$04#$05#$06#$07#$08#$09#$0a#$0b#$0c#$0d#$0e#$0f+
#$10#$11#$12#$13#$14#$15#$16#$17#$18#$19#$1a#$1b#$1c#$1d#$1e#$1f+
#$20#$21#$22#$23#$24#$25#$26#$27#$28#$29#$2a#$2b#$2c#$2d#$2e#$2f+
#$30#$31#$32#$33#$34#$35#$36#$37#$38#$39#$3a#$3b#$3c#$3d#$3e#$3f+
#$40#$41#$42#$43#$44#$45#$46#$47#$48#$49#$4a#$4b#$4c#$4d#$4e#$4f+
#$50#$51#$52#$53#$54#$55#$56#$57#$58#$59#$5a#$5b#$5c#$5d#$5e#$5f+
#$60#$61#$62#$63#$64#$65#$66#$67#$68#$69#$6a#$6b#$6c#$6d#$6e#$6f+
#$70#$71#$72#$73#$74#$75#$76#$77#$78#$79#$7a#$7b#$7c#$7d#$7e#$7f+
#$80#$81#$82#$83#$84#$85#$86#$87#$88#$89#$8a#$8b#$8c#$8d#$8e#$8f+
#$90#$91#$92#$93#$94#$95#$96#$97#$98#$99#$9a#$9b#$9c#$9d#$9e#$9f+
#$a0#$a1#$a2#$a3#$a4#$a5#$a6#$a7#$a8#$a9#$aa#$ab#$ac#$ad#$ae#$af+
#$b0#$b1#$b2#$b3#$b4#$b5#$b6#$b7#$b8#$b9#$ba#$bb#$bc#$bd#$be#$bf+
#$c0#$c1#$c2#$c3#$c4#$c5#$c6#$c7#$c8#$c9#$ca#$cb#$cc#$cd#$ce#$cf+
#$d0#$d1#$d2#$d3#$d4#$d5#$d6#$d7#$d8#$d9#$da#$db#$dc#$dd#$de#$df+
#$e0#$e1#$e2#$e3#$e4#$e5#$e6#$e7#$e8#$e9#$ea#$eb#$ec#$ed#$ee#$ef+
#$f0#$f1#$f2#$f3#$f4#$f5#$f6#$f7#$f8#$f9#$fa#$fb#$fc#$fd#$fe#$ff;
RawByteTest=
RawByteString(AllByteValues);
GreekTest=
GreekString(AllByteValues);
AnsiTest=
ansistring(AllByteValues);

procedure TForm3.Button2Click(Sender: TObject);
var
i:0..255;
ErrorList:string;
c:char;
ac:ansichar;
utf16:string;
begin
Assert (length(AllByteValues)=256,'The number of characters is just like in Delphi 2006');
Assert (sizeof(AllByteValues)=4,'This is a pointer');
Assert (sizeof(AllByteValues[1])=2,'But each character is now 2 bytes');
Assert (AllByteValues[1]=#0);
Assert (length(RawByteTest)=256);
Assert (sizeof(RawByteTest)=4,'This is a pointer');
Assert (sizeof(RawByteTest[1])=1,'Using RawByteString in a const the bytes stay as they are');
Assert (RawByteTest[1]=#0);
Assert (RawByteTest[1]=char(0));
Assert (RawByteTest[1]=chr(0));
ac:=#0;
Assert (RawByteTest[1]=ac);
c:=#0;
// Assert (RawByteTest[1]=c); // This line does not compile! - AnsiChar and Char are absolutely not compatible in any way.
Assert (ord(RawByteTest[1])=ord(c)); // This compiles nicely

// Demonstrate how #128..#159 does not exist in Unicode and therefore causes big trouble!
ErrorList:='';
for i:=0 to 255 do begin
if ord(AllByteValues[i+1])<>i then
ErrorList:=ErrorList+IntToStr(i)+' ';
end;
Assert (ErrorList='128 130 131 132 133 134 135 136 137 138 139 '+
'140 142 145 146 147 148 149 150 151 152 153 154 155 156 158 159 ',
'These values are not saved in a string in the way you would expect!!');

// GreekString also destroys constants with binary data
ErrorList:='';
for i:=0 to 255 do begin
if ord(GreekTest[i+1])<>i then
ErrorList:=ErrorList+IntToStr(i)+' ';
end;
Assert (ErrorList='136 138 140 142 152 154 156 158 159 161 162 170 175 '+
'180 184 185 186 188 190 191 192 193 194 195 196 197 198 199 200 201 '+
'202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 '+
'219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 '+
'236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 '+
'253 254 255 ',
'These values are not saved in a string in the way you would expect!!');

// RawByteString stores all bytes correctly
for i:=0 to 255 do begin
Assert (ord(RawByteTest[i+1])=i);
end;

// Ansistring also stores all bytes correctly (tested on a Windows-1252 machine)
for i:=0 to 255 do begin
Assert (ord(AnsiTest[i+1])=i);
end;

// Most common ansistring stuff works as expected
Assert (ord(AnsiTest[129])=128);
Assert (AnsiTest[129]=#128);
Assert (copy(AnsiTest,129,1)=#128);
Assert (MidStr(AnsiTest,129,1)=#128);
Assert (pos(#128,AnsiTest)=129);

// The same functions using UnicodeString
utf16:=AllByteValues;
Assert (ord(utf16[129])=8364);
Assert (utf16[129]=#8364);
Assert (copy(utf16,129,1)=#8364);
Assert (MidStr(utf16,129,1)=#8364);
Assert (pos(#128,utf16)=129); // #128 is converted to #8364 before calling the widestring version of pos()
Assert (pos(#8364,utf16)=129);

// Don't copy raw binary data into an utf-16 string type!
utf16:=RawByteTest;
ErrorList:='';
for i:=0 to 255 do begin
if ord(utf16[i+1])<>i then
ErrorList:=ErrorList+IntToStr(i)+' ';
end;
Assert (ErrorList='128 130 131 132 133 134 135 136 137 138 139 140 142 145 '+
'146 147 148 149 150 151 152 153 154 155 156 158 159 ',
'These values are not saved in a string in the way you would expect!!');

// Windows automatically handles unsupported byte values in strange ways.
c:=#128;
Assert (ord(c)<>128);
Assert (ord(c)=8364);
Assert (c='€');

ac:=#128;
Assert (ord(ac)=128);
Assert (ord(ac)<>8364);
Assert (ac='€','Here, ac is converted to a utf-16 string type using local character set');

// Don't use inc() or dec() with utf-16. It works, but it's not good
utf16:=#127;
Assert (ord(utf16[1])=127);
inc (utf16[1]);
Assert (ord(utf16[1])=128);
Assert (utf16[1]<>#128); // Because #128 becomes #8364
Assert (#128=#8364); // as you can see here
end;


Conclusion: Always use RawByteString or AnsiString for binary data, and never store binary data in other string types.

5 comments:

Anonymous said...

You mean aside from the obvious problem that you can only store an even numbered amount of data in the new double byte based string *OR* have each byte word aligned.

Both of which defeat the purpose?

Anonymous said...

Nonsense.

AnsiString and RawByteString both use one byte per character. Look it up.

Anonymous said...

I think the reference was to the new string type, not the old ansistring type.

Anonymous said...

On my pc(Simplified Chinese),
length(AllByteValues) = 193, not 256.

Serge said...

>AnsiString and RawByteString both
>use one byte per character.

Yes it's true if you are using one byte per character code page (latin or cyrillic).

For hieroglyphs it's not true.