【C】memcpy真的就只是复制内存吗？然而并没有那么简单。

0xAA55 发表于 2017-5-29 21:14:48

曾有人在QQ群里发了一个代码片段，自己实现memcpy的功能，内容大概如下：

void COPY(void *dest, void *src, size_t len)
{
char *cdest = dest;
char *csrc = src;
size_t i;
for(i = 0; i < len; i++)
   cdest[i] = csrc[i];
}

代码非常简单直白，通俗易懂，但我们真的有必要造个这样的轮子吗？先不说别的，memcpy真的就是这样实现的吗？
我们来看一下VS2012的memcpy的反汇编。这里我随便搞了个工程然后随便打了些代码，要的就是在执行memcpy的时候戳个断点进去看。

嗯，可以看到VS2012的memcpy是拿汇编写的，而且我们可以看到源码。我把它复制出来，详细的解释在后面，这里只做参考，可略过。

--- f:\dd\vctools\crt_bld\SELF_X86\crt\src\INTEL\memcpy.asm --------------------
   dst:ptr byte, \
   src:ptr byte, \
   count:IWORD

   ; destination pointer
   ; source pointer
   ; number of bytes to copy

   OPTION PROLOGUE:NONE, EPILOGUE:NONE

   push edi          ;U - save edi
0012B090 57                push    edi
   push esi          ;V - save esi
0012B091 56                push    esi

;                size param/4 prolog byte#reg saved
   .FPO ( 0, 3       , $-_MEM_ , 2, 0, 0 )

   mov esi,[esp + 010h] ;U - esi = source
0012B092 8B 74 24 10       mov       esi,dword ptr [esp+10h]
   mov ecx,[esp + 014h] ;V - ecx = number of bytes to move
0012B096 8B 4C 24 14       mov       ecx,dword ptr [esp+14h]
   mov edi,[esp + 0Ch]    ;U - edi = dest
0012B09A 8B 7C 24 0C       mov       edi,dword ptr [esp+0Ch]

;
; Check for overlapping buffers:
;    If (dst <= src) Or (dst >= src + Count) Then
;             Do normal (Upwards) Copy
;    Else
;             Do Downwards Copy to avoid propagation
;

   mov eax,ecx       ;V - eax = byte count...
0012B09E 8B C1             mov       eax,ecx

   mov edx,ecx       ;U - edx = byte count...
0012B0A0 8B D1             mov       edx,ecx
   add eax,esi       ;V - eax = point past source end
0012B0A2 03 C6             add       eax,esi

   cmp edi,esi       ;U - dst <= src ?
0012B0A4 3B FE             cmp       edi,esi
   jbe short CopyUp ;V - yes, copy toward higher addresses
0012B0A6 76 08             jbe       CopyUp (012B0B0h)

   cmp edi,eax       ;U - dst < (src + count) ?
0012B0A8 3B F8             cmp       edi,eax
   jb    CopyDown    ;V - yes, copy toward lower addresses
0012B0AA 0F 82 68 03 00 00 jb       TrailUpVec+50h (012B418h)

;
; Copy toward higher addresses.
;
CopyUp:
;
   ; See if Enhanced Fast Strings is supported.
   ; ENFSTRG supported?
   bt    __favor, __FAVOR_ENFSTRG
0012B0B0 0F BA 25 FC 63 17 00 01 bt       dword ptr ds:[1763FCh],1
   jnc CopyUpSSE2Check             ; no jump
0012B0B8 73 07             jae       CopyUp+11h (012B0C1h)
   ;
   ; use Enhanced Fast Strings
   rep movsb
0012B0BA F3 A4             rep movs byte ptr es:[edi],byte ptr [esi]
   jmp TrailUp0       ; Done
0012B0BC E9 17 03 00 00    jmp       TrailUpVec+10h (012B3D8h)
CopyUpSSE2Check:
;
; Next, see if we can use a "fast" copy SSE2 routine
   ; block size greater than min threshold?
   cmp ecx,080h
0012B0C1 81 F9 80 00 00 00 cmp       ecx,80h
   jb    Dword_align; length too small go use dwords
0012B0C7 0F 82 CE 01 00 00 jb       CopyUp+1EBh (012B29Bh)
   ; alignments equal?
   mov eax,edi
0012B0CD 8B C7             mov       eax,edi
   xor eax,esi
0012B0CF 33 C6             xor       eax,esi
   test eax,15
0012B0D1 A9 0F 00 00 00    test    eax,0Fh
   jne AtomChk ; Not aligned go check Atom
0012B0D6 75 0E             jne       CopyUp+36h (012B0E6h)
   bt    __isa_enabled, __ISA_AVAILABLE_SSE2
0012B0D8 0F BA 25 00 50 17 00 01 bt       dword ptr ds:[175000h],1
   jc    VEC_memcpy ; yes, go SSE2 copy (params already set)
0012B0E0 0F 82 DA 04 00 00 jb       TrailDownVec+5Ch (012B5C0h)
AtomChk:
   ; Is Atom supported?
   bt    __favor, __FAVOR_ATOM
0012B0E6 0F BA 25 FC 63 17 00 00 bt       dword ptr ds:[1763FCh],0
   jnc Dword_align ; no,jump
0012B0EE 0F 83 A7 01 00 00 jae       CopyUp+1EBh (012B29Bh)

   ; check if dst is 4 byte aligned
   test edi, 3
0012B0F4 F7 C7 03 00 00 00 test    edi,3
   jne CopyLeadUp
0012B0FA 0F 85 B8 01 00 00 jne       CopyUp+208h (012B2B8h)

   ; check if src is 4 byte aligned
   test esi, 3
0012B100 F7 C6 03 00 00 00 test    esi,3
   jne Dword_align_Ok
0012B106 0F 85 97 01 00 00 jne       CopyUp+1F3h (012B2A3h)

; A software pipelining vectorized memcpy loop using PALIGN instructions

; (1) copy the first bytes to align dst up to the nearest 16-byte boundary
; 4 byte align -> 12 byte copy, 8 byte align -> 8 byte copy, 12 byte align -> 4 byte copy
PalignHead4:
   bt    edi, 2
0012B10C 0F BA E7 02       bt       edi,2
   jae PalignHead8
0012B110 73 0D             jae       CopyUp+6Fh (012B11Fh)
   mov eax, dword ptr [esi]
0012B112 8B 06             mov       eax,dword ptr [esi]
   sub ecx, 4
0012B114 83 E9 04          sub       ecx,4
   lea esi, byte ptr [esi+4]
0012B117 8D 76 04          lea       esi,[esi+4]
   mov dword ptr [edi], eax
0012B11A 89 07             mov       dword ptr [edi],eax
   lea edi, byte ptr [edi+4]
0012B11C 8D 7F 04          lea       edi,[edi+4]

PalignHead8:
   bt    edi, 3
0012B11F 0F BA E7 03       bt       edi,3
   jae PalignLoop
0012B123 73 11             jae       CopyUp+86h (012B136h)
   movq xmm1, qword ptr [esi]
0012B125 F3 0F 7E 0E       movq    xmm1,mmword ptr [esi]
   sub ecx, 8
0012B129 83 E9 08          sub       ecx,8
   lea esi, byte ptr [esi+8]
0012B12C 8D 76 08          lea       esi,[esi+8]
   movq qword ptr [edi], xmm1
0012B12F 66 0F D6 0F       movq    mmword ptr [edi],xmm1
   lea edi, byte ptr [edi+8]
0012B133 8D 7F 08          lea       edi,[edi+8]

;(2) Use SSE palign loop
PalignLoop:
   test esi, 7
0012B136 F7 C6 07 00 00 00 test    esi,7
   je    MovPalign8
0012B13C 74 63             je       CopyUp+0F1h (012B1A1h)
   bt    esi, 3
0012B13E 0F BA E6 03       bt       esi,3
   jae MovPalign4
0012B142 0F 83 B2 00 00 00 jae       CopyUp+14Ah (012B1FAh)

PALIGN_memcpy 12
0012B148 66 0F 6F 4E F4    movdqa    xmm1,xmmword ptr [esi-0Ch]
0012B14D 8D 76 F4          lea       esi,[esi-0Ch]
PalignLoop12:
0012B150 66 0F 6F 5E 10    movdqa    xmm3,xmmword ptr [esi+10h]
0012B155 83 E9 30          sub       ecx,30h
0012B158 66 0F 6F 46 20    movdqa    xmm0,xmmword ptr [esi+20h]
0012B15D 66 0F 6F 6E 30    movdqa    xmm5,xmmword ptr [esi+30h]
0012B162 8D 76 30          lea       esi,[esi+30h]
0012B165 83 F9 30          cmp       ecx,30h
0012B168 66 0F 6F D3       movdqa    xmm2,xmm3
0012B16C 66 0F 3A 0F D9 0C palignr xmm3,xmm1,0Ch
0012B172 66 0F 7F 1F       movdqa    xmmword ptr [edi],xmm3
0012B176 66 0F 6F E0       movdqa    xmm4,xmm0
0012B17A 66 0F 3A 0F C2 0C palignr xmm0,xmm2,0Ch
0012B180 66 0F 7F 47 10    movdqa    xmmword ptr [edi+10h],xmm0
0012B185 66 0F 6F CD       movdqa    xmm1,xmm5
0012B189 66 0F 3A 0F EC 0C palignr xmm5,xmm4,0Ch
0012B18F 66 0F 7F 6F 20    movdqa    xmmword ptr [edi+20h],xmm5
0012B194 8D 7F 30          lea       edi,[edi+30h]
0012B197 7D B7             jge       CopyUp+0A0h (012B150h)
0012B199 8D 76 0C          lea       esi,[esi+0Ch]
   jmp PalignTail
0012B19C E9 AF 00 00 00    jmp       CopyUp+1A0h (012B250h)

PALIGN_memcpy 8
0012B1A1 66 0F 6F 4E F8    movdqa    xmm1,xmmword ptr
0012B1A6 8D 76 F8          lea       esi,
0012B1A9 8D 49 00          lea       ecx,
PalignLoop8:
0012B1AC 66 0F 6F 5E 10    movdqa    xmm3,xmmword ptr
0012B1B1 83 E9 30          sub       ecx,30h
0012B1B4 66 0F 6F 46 20    movdqa    xmm0,xmmword ptr
0012B1B9 66 0F 6F 6E 30    movdqa    xmm5,xmmword ptr
0012B1BE 8D 76 30          lea       esi,
0012B1C1 83 F9 30          cmp       ecx,30h
0012B1C4 66 0F 6F D3       movdqa    xmm2,xmm3
0012B1C8 66 0F 3A 0F D9 08 palignr xmm3,xmm1,8
0012B1CE 66 0F 7F 1F       movdqa    xmmword ptr ,xmm3
0012B1D2 66 0F 6F E0       movdqa    xmm4,xmm0
0012B1D6 66 0F 3A 0F C2 08 palignr xmm0,xmm2,8
0012B1DC 66 0F 7F 47 10    movdqa    xmmword ptr ,xmm0
0012B1E1 66 0F 6F CD       movdqa    xmm1,xmm5
0012B1E5 66 0F 3A 0F EC 08 palignr xmm5,xmm4,8
0012B1EB 66 0F 7F 6F 20    movdqa    xmmword ptr ,xmm5
0012B1F0 8D 7F 30          lea       edi,
0012B1F3 7D B7             jge       CopyUp+0FCh (012B1ACh)
0012B1F5 8D 76 08          lea       esi,
   jmp PalignTail
0012B1F8 EB 56             jmp       CopyUp+1A0h (012B250h)

PALIGN_memcpy 4
0012B1FA 66 0F 6F 4E FC    movdqa    xmm1,xmmword ptr
0012B1FF 8D 76 FC          lea       esi,
0012B202 8B FF             mov       edi,edi
PalignLoop4:
0012B204 66 0F 6F 5E 10    movdqa    xmm3,xmmword ptr
0012B209 83 E9 30          sub       ecx,30h
0012B20C 66 0F 6F 46 20    movdqa    xmm0,xmmword ptr
0012B211 66 0F 6F 6E 30    movdqa    xmm5,xmmword ptr
0012B216 8D 76 30          lea       esi,
0012B219 83 F9 30          cmp       ecx,30h
0012B21C 66 0F 6F D3       movdqa    xmm2,xmm3
0012B220 66 0F 3A 0F D9 04 palignr xmm3,xmm1,4
0012B226 66 0F 7F 1F       movdqa    xmmword ptr ,xmm3
0012B22A 66 0F 6F E0       movdqa    xmm4,xmm0
0012B22E 66 0F 3A 0F C2 04 palignr xmm0,xmm2,4
0012B234 66 0F 7F 47 10    movdqa    xmmword ptr ,xmm0
0012B239 66 0F 6F CD       movdqa    xmm1,xmm5
0012B23D 66 0F 3A 0F EC 04 palignr xmm5,xmm4,4
0012B243 66 0F 7F 6F 20    movdqa    xmmword ptr ,xmm5
0012B248 8D 7F 30          lea       edi,
0012B24B 7D B7             jge       CopyUp+154h (012B204h)
0012B24D 8D 76 04          lea       esi,

;(3) Copy the tailing bytes.
PalignTail:
   cmp ecx,10h
0012B250 83 F9 10          cmp       ecx,10h
   jl PalignTail4
0012B253 7C 13             jl       CopyUp+1B8h (012B268h)
   movdqu xmm1,xmmword ptr
0012B255 F3 0F 6F 0E       movdqu    xmm1,xmmword ptr
   sub ecx, 10h
0012B259 83 E9 10          sub       ecx,10h
   lea esi, xmmword ptr
0012B25C 8D 76 10          lea       esi,
   movdqa xmmword ptr ,xmm1
0012B25F 66 0F 7F 0F       movdqa    xmmword ptr ,xmm1
   lea edi, xmmword ptr
0012B263 8D 7F 10          lea       edi,
   jmp PalignTail
0012B266 EB E8             jmp       CopyUp+1A0h (012B250h)

PalignTail4:
   bt    ecx, 2
0012B268 0F BA E1 02       bt       ecx,2
   jae PalignTail8
0012B26C 73 0D             jae       CopyUp+1CBh (012B27Bh)
   mov eax, dword ptr
0012B26E 8B 06             mov       eax,dword ptr
   sub ecx,4
0012B270 83 E9 04          sub       ecx,4
   lea esi, byte ptr
0012B273 8D 76 04          lea       esi,
   mov dword ptr , eax
0012B276 89 07             mov       dword ptr ,eax
   lea edi, byte ptr
0012B278 8D 7F 04          lea       edi,

PalignTail8:
   bt    ecx, 3
0012B27B 0F BA E1 03       bt       ecx,3
   jae PalignTailLE3
0012B27F 73 11             jae       CopyUp+1E2h (012B292h)
   movq xmm1, qword ptr
0012B281 F3 0F 7E 0E       movq    xmm1,mmword ptr
   sub ecx,8
0012B285 83 E9 08          sub       ecx,8
   lea esi, byte ptr
0012B288 8D 76 08          lea       esi,
   movq qword ptr , xmm1
0012B28B 66 0F D6 0F       movq    mmword ptr ,xmm1
   lea edi, byte ptr
0012B28F 8D 7F 08          lea       edi,

PalignTailLE3:
   mov eax, dword ptr TrailUpVec
0012B292 8B 04 8D C8 B3 12 00 mov       eax,dword ptr
   jmp eax
0012B299 FF E0             jmp       eax

; The algorithm for forward moves is to align the destination to a dword
; boundary and so we can move dwords with an aligned destination.This
; occurs in 3 steps.
;
; - move x = ((4 - Dest & 3) & 3) bytes
; - move y = ((L-x) >> 2) dwords
; - move (L - x - y*4) bytes
;

Dword_align:
   test edi,11b       ;U - destination dword aligned?
0012B29B F7 C7 03 00 00 00 test    edi,3
   jnz short CopyLeadUp ;V - if we are not dword aligned already, align
0012B2A1 75 15             jne       CopyUp+208h (012B2B8h)
Dword_align_Ok:
   shr ecx,2       ;U - shift down to dword count
0012B2A3 C1 E9 02          shr       ecx,2
   and edx,11b       ;V - trailing byte count
0012B2A6 83 E2 03          and       edx,3

   cmp ecx,8       ;U - test if small enough for unwind copy
0012B2A9 83 F9 08          cmp       ecx,8
   jb    short CopyUnwindUp ;V - if so, then jump
0012B2AC 72 2A             jb       CopyUp+228h (012B2D8h)

   rep movsd       ;N - move all of our dwords
0012B2AE F3 A5             rep movs dword ptr es:,dword ptr

   jmp dword ptr TrailUpVec ;N - process trailing bytes
0012B2B0 FF 24 95 C8 B3 12 00 jmp       dword ptr
0012B2B7 90                nop

;
; Code to do optimal memory copies for non-dword-aligned destinations.
;

; The following length check is done for two reasons:
;
; 1. to ensure that the actual move length is greater than any possible
;    alignment move, and
;
; 2. to skip the multiple move logic for small moves where it would
;    be faster to move the bytes with one instruction.
;

   align @WordSize
CopyLeadUp:

   mov eax,edi       ;U - get destination offset
0012B2B8 8B C7             mov       eax,edi
   mov edx,11b       ;V - prepare for mask
0012B2BA BA 03 00 00 00    mov       edx,3

   sub ecx,4       ;U - check for really short string - sub for adjust
0012B2BF 83 E9 04          sub       ecx,4
   jb    short ByteCopyUp ;V - branch to just copy bytes
0012B2C2 72 0C             jb       CopyUp+220h (012B2D0h)

   and eax,11b       ;U - get offset within first dword
0012B2C4 83 E0 03          and       eax,3
   add ecx,eax       ;V - update size after leading bytes copied
0012B2C7 03 C8             add       ecx,eax

   jmp dword ptr LeadUpVec ;N - process leading bytes
0012B2C9 FF 24 85 DC B2 12 00 jmp       dword ptr

   align @WordSize
ByteCopyUp:
   jmp dword ptr TrailUpVec ;N - process just bytes
0012B2D0 FF 24 8D D8 B3 12 00 jmp       dword ptr
0012B2D7 90                nop

   align @WordSize
CopyUnwindUp:
   jmp dword ptr UnwindUpVec ;N - unwind dword copy
0012B2D8 FF 24 8D 5C B3 12 00 jmp       dword ptr
0012B2DF 90                nop
--- 无源文件 -----------------------------------------------------------------------
0012B2E0 EC                in       al,dx
0012B2E1 B2 12             mov       dl,12h
0012B2E3 00 18             add       byte ptr ,bl
0012B2E5 B3 12             mov       bl,12h
0012B2E7 00 3C B3          add       byte ptr ,bh
0012B2EA 12 00             adc       al,byte ptr
LeadUp1:
0012B2EC 23 D1             and       edx,ecx
0012B2EE 8A 06             mov       al,byte ptr
0012B2F0 88 07             mov       byte ptr ,al
0012B2F2 8A 46 01          mov       al,byte ptr
0012B2F5 88 47 01          mov       byte ptr ,al
0012B2F8 8A 46 02          mov       al,byte ptr
0012B2FB C1 E9 02          shr       ecx,2
0012B2FE 88 47 02          mov       byte ptr ,al
0012B301 83 C6 03          add       esi,3
0012B304 83 C7 03          add       edi,3
0012B307 83 F9 08          cmp       ecx,8
0012B30A 72 CC             jb       CopyUp+228h (012B2D8h)
0012B30C F3 A5             rep movs dword ptr es:,dword ptr
0012B30E FF 24 95 C8 B3 12 00 jmp       dword ptr
0012B315 8D 49 00          lea       ecx,
LeadUp2:
0012B318 23 D1             and       edx,ecx
0012B31A 8A 06             mov       al,byte ptr
0012B31C 88 07             mov       byte ptr ,al
0012B31E 8A 46 01          mov       al,byte ptr
0012B321 C1 E9 02          shr       ecx,2
0012B324 88 47 01          mov       byte ptr ,al
0012B327 83 C6 02          add       esi,2
0012B32A 83 C7 02          add       edi,2
0012B32D 83 F9 08          cmp       ecx,8
0012B330 72 A6             jb       CopyUp+228h (012B2D8h)
0012B332 F3 A5             rep movs dword ptr es:,dword ptr
0012B334 FF 24 95 C8 B3 12 00 jmp       dword ptr
0012B33B 90                nop
LeadUp3:
0012B33C 23 D1             and       edx,ecx
0012B33E 8A 06             mov       al,byte ptr
0012B340 88 07             mov       byte ptr ,al
0012B342 83 C6 01          add       esi,1
0012B345 C1 E9 02          shr       ecx,2
0012B348 83 C7 01          add       edi,1
0012B34B 83 F9 08          cmp       ecx,8
0012B34E 72 88             jb       CopyUp+228h (012B2D8h)
0012B350 F3 A5             rep movs dword ptr es:,dword ptr
0012B352 FF 24 95 C8 B3 12 00 jmp       dword ptr
0012B359 8D 49 00          lea       ecx,
0012B35C BF B3 12 00 AC    mov       edi,0AC0012B3h
0012B361 B3 12             mov       bl,12h
0012B363 00 A4 B3 12 00 9C B3 add       byte ptr ,ah
0012B36A 12 00             adc       al,byte ptr
0012B36C 94                xchg    eax,esp
0012B36D B3 12             mov       bl,12h
0012B36F 00 8C B3 12 00 84 B3 add       byte ptr ,cl
0012B376 12 00             adc       al,byte ptr
0012B378 7C B3             jl       LeadUpVec+4Dh (012B32Dh)
0012B37A 12 00             adc       al,byte ptr
UnwindUp7:
0012B37C 8B 44 8E E4       mov       eax,dword ptr
0012B380 89 44 8F E4       mov       dword ptr ,eax
UnwindUp6:
0012B384 8B 44 8E E8       mov       eax,dword ptr
0012B388 89 44 8F E8       mov       dword ptr ,eax
UnwindUp5:
0012B38C 8B 44 8E EC       mov       eax,dword ptr
0012B390 89 44 8F EC       mov       dword ptr ,eax
UnwindUp4:
0012B394 8B 44 8E F0       mov       eax,dword ptr
0012B398 89 44 8F F0       mov       dword ptr ,eax
UnwindUp3:
0012B39C 8B 44 8E F4       mov       eax,dword ptr
0012B3A0 89 44 8F F4       mov       dword ptr ,eax
UnwindUp2:
0012B3A4 8B 44 8E F8       mov       eax,dword ptr
0012B3A8 89 44 8F F8       mov       dword ptr ,eax
UnwindUp1:
0012B3AC 8B 44 8E FC       mov       eax,dword ptr
0012B3B0 89 44 8F FC       mov       dword ptr ,eax
0012B3B4 8D 04 8D 00 00 00 00 lea       eax,
0012B3BB 03 F0             add       esi,eax
0012B3BD 03 F8             add       edi,eax
UnwindUp0:
0012B3BF FF 24 95 C8 B3 12 00 jmp       dword ptr
0012B3C6 8B FF             mov       edi,edi
0012B3C8 D8 B3 12 00 E0 B3 fdiv    dword ptr
0012B3CE 12 00             adc       al,byte ptr
0012B3D0 EC                in       al,dx
0012B3D1 B3 12             mov       bl,12h
0012B3D3 00 00             add       byte ptr ,al
0012B3D5 B4 12             mov       ah,12h
0012B3D7 00 8B 44 24 0C 5E add       byte ptr ,cl
0012B3DD 5F                pop       edi
0012B3DE C3                ret
0012B3DF 90                nop
TrailUp1:
0012B3E0 8A 06             mov       al,byte ptr
0012B3E2 88 07             mov       byte ptr ,al
0012B3E4 8B 44 24 0C       mov       eax,dword ptr
0012B3E8 5E                pop       esi
0012B3E9 5F                pop       edi
0012B3EA C3                ret
0012B3EB 90                nop
TrailUp2:
0012B3EC 8A 06             mov       al,byte ptr
0012B3EE 88 07             mov       byte ptr ,al
0012B3F0 8A 46 01          mov       al,byte ptr
0012B3F3 88 47 01          mov       byte ptr ,al
0012B3F6 8B 44 24 0C       mov       eax,dword ptr
0012B3FA 5E                pop       esi
0012B3FB 5F                pop       edi
0012B3FC C3                ret
0012B3FD 8D 49 00          lea       ecx,
TrailUp3:
0012B400 8A 06             mov       al,byte ptr
0012B402 88 07             mov       byte ptr ,al
0012B404 8A 46 01          mov       al,byte ptr
0012B407 88 47 01          mov       byte ptr ,al
0012B40A 8A 46 02          mov       al,byte ptr
0012B40D 88 47 02          mov       byte ptr ,al
0012B410 8B 44 24 0C       mov       eax,dword ptr
0012B414 5E                pop       esi
0012B415 5F                pop       edi
0012B416 C3                ret
0012B417 90                nop
CopyDown:
0012B418 8D 74 31 FC       lea       esi,
0012B41C 8D 7C 39 FC       lea       edi,
0012B420 F7 C7 03 00 00 00 test    edi,3
0012B426 75 24             jne       TrailUpVec+84h (012B44Ch)
0012B428 C1 E9 02          shr       ecx,2
0012B42B 83 E2 03          and       edx,3
0012B42E 83 F9 08          cmp       ecx,8
0012B431 72 0D             jb       TrailUpVec+78h (012B440h)
0012B433 FD                std
0012B434 F3 A5             rep movs dword ptr es:,dword ptr
0012B436 FC                cld
0012B437 FF 24 95 64 B5 12 00 jmp       dword ptr
0012B43E 8B FF             mov       edi,edi
CopyUnwindDown:
0012B440 F7 D9             neg       ecx
0012B442 FF 24 8D 14 B5 12 00 jmp       dword ptr
0012B449 8D 49 00          lea       ecx,
CopyLeadDown:
0012B44C 8B C7             mov       eax,edi
0012B44E BA 03 00 00 00    mov       edx,3
0012B453 83 F9 04          cmp       ecx,4
0012B456 72 0C             jb       TrailUpVec+9Ch (012B464h)
0012B458 83 E0 03          and       eax,3
0012B45B 2B C8             sub       ecx,eax
0012B45D FF 24 85 68 B4 12 00 jmp       dword ptr
ByteCopyDown:
0012B464 FF 24 8D 64 B5 12 00 jmp       dword ptr
0012B46B 90                nop
0012B46C 78 B4             js       TrailUpVec+5Ah (012B422h)
0012B46E 12 00             adc       al,byte ptr
0012B470 9C                pushfd
0012B471 B4 12             mov       ah,12h
0012B473 00 C4             add       ah,al
0012B475 B4 12             mov       ah,12h
0012B477 00 8A 46 03 23 D1 add       byte ptr ,cl
0012B47D 88 47 03          mov       byte ptr ,al
0012B480 83 EE 01          sub       esi,1
0012B483 C1 E9 02          shr       ecx,2
0012B486 83 EF 01          sub       edi,1
0012B489 83 F9 08          cmp       ecx,8
0012B48C 72 B2             jb       TrailUpVec+78h (012B440h)
0012B48E FD                std
0012B48F F3 A5             rep movs dword ptr es:,dword ptr
0012B491 FC                cld
0012B492 FF 24 95 64 B5 12 00 jmp       dword ptr
0012B499 8D 49 00          lea       ecx,
LeadDown2:
0012B49C 8A 46 03          mov       al,byte ptr
0012B49F 23 D1             and       edx,ecx
0012B4A1 88 47 03          mov       byte ptr ,al
0012B4A4 8A 46 02          mov       al,byte ptr
0012B4A7 C1 E9 02          shr       ecx,2
0012B4AA 88 47 02          mov       byte ptr ,al
0012B4AD 83 EE 02          sub       esi,2
0012B4B0 83 EF 02          sub       edi,2
0012B4B3 83 F9 08          cmp       ecx,8
0012B4B6 72 88             jb       TrailUpVec+78h (012B440h)
0012B4B8 FD                std
0012B4B9 F3 A5             rep movs dword ptr es:,dword ptr
0012B4BB FC                cld
0012B4BC FF 24 95 64 B5 12 00 jmp       dword ptr
0012B4C3 90                nop
LeadDown3:
0012B4C4 8A 46 03          mov       al,byte ptr
0012B4C7 23 D1             and       edx,ecx
0012B4C9 88 47 03          mov       byte ptr ,al
0012B4CC 8A 46 02          mov       al,byte ptr
0012B4CF 88 47 02          mov       byte ptr ,al
0012B4D2 8A 46 01          mov       al,byte ptr
0012B4D5 C1 E9 02          shr       ecx,2
0012B4D8 88 47 01          mov       byte ptr ,al
0012B4DB 83 EE 03          sub       esi,3
0012B4DE 83 EF 03          sub       edi,3
0012B4E1 83 F9 08          cmp       ecx,8
0012B4E4 0F 82 56 FF FF FF jb       TrailUpVec+78h (012B440h)
0012B4EA FD                std
0012B4EB F3 A5             rep movs dword ptr es:,dword ptr
0012B4ED FC                cld
0012B4EE FF 24 95 64 B5 12 00 jmp       dword ptr
0012B4F5 8D 49 00          lea       ecx,
0012B4F8 18 B5 12 00 20 B5 sbb       byte ptr ,dh
0012B4FE 12 00             adc       al,byte ptr
0012B500 28 B5 12 00 30 B5 sub       byte ptr ,dh
0012B506 12 00             adc       al,byte ptr
0012B508 38 B5 12 00 40 B5 cmp       byte ptr ,dh
0012B50E 12 00             adc       al,byte ptr
0012B510 48                dec       eax
0012B511 B5 12             mov       ch,12h
0012B513 00 5B B5          add       byte ptr ,bl
0012B516 12 00             adc       al,byte ptr
UnwindDown7:
0012B518 8B 44 8E 1C       mov       eax,dword ptr
0012B51C 89 44 8F 1C       mov       dword ptr ,eax
UnwindDown6:
0012B520 8B 44 8E 18       mov       eax,dword ptr
0012B524 89 44 8F 18       mov       dword ptr ,eax
UnwindDown5:
0012B528 8B 44 8E 14       mov       eax,dword ptr
0012B52C 89 44 8F 14       mov       dword ptr ,eax
UnwindDown4:
0012B530 8B 44 8E 10       mov       eax,dword ptr
0012B534 89 44 8F 10       mov       dword ptr ,eax
UnwindDown3:
0012B538 8B 44 8E 0C       mov       eax,dword ptr
0012B53C 89 44 8F 0C       mov       dword ptr ,eax
UnwindDown2:
0012B540 8B 44 8E 08       mov       eax,dword ptr
0012B544 89 44 8F 08       mov       dword ptr ,eax
UnwindDown1:
0012B548 8B 44 8E 04       mov       eax,dword ptr
0012B54C 89 44 8F 04       mov       dword ptr ,eax
0012B550 8D 04 8D 00 00 00 00 lea       eax,
0012B557 03 F0             add       esi,eax
0012B559 03 F8             add       edi,eax
UnwindDown0:
0012B55B FF 24 95 64 B5 12 00 jmp       dword ptr
0012B562 8B FF             mov       edi,edi
0012B564 74 B5             je       UnwindDownVec+23h (012B51Bh)
0012B566 12 00             adc       al,byte ptr
0012B568 7C B5             jl       UnwindDownVec+27h (012B51Fh)
0012B56A 12 00             adc       al,byte ptr
0012B56C 8C B5 12 00 A0 B5 mov       word ptr ,st(-2)
0012B572 12 00             adc       al,byte ptr
TrailDown0:
0012B574 8B 44 24 0C       mov       eax,dword ptr
0012B578 5E                pop       esi
0012B579 5F                pop       edi
0012B57A C3                ret
0012B57B 90                nop
TrailDown1:
0012B57C 8A 46 03          mov       al,byte ptr
0012B57F 88 47 03          mov       byte ptr ,al
0012B582 8B 44 24 0C       mov       eax,dword ptr
0012B586 5E                pop       esi
0012B587 5F                pop       edi
0012B588 C3                ret
0012B589 8D 49 00          lea       ecx,
TrailDown2:
0012B58C 8A 46 03          mov       al,byte ptr
0012B58F 88 47 03          mov       byte ptr ,al
0012B592 8A 46 02          mov       al,byte ptr
0012B595 88 47 02          mov       byte ptr ,al
0012B598 8B 44 24 0C       mov       eax,dword ptr
0012B59C 5E                pop       esi
0012B59D 5F                pop       edi
0012B59E C3                ret
0012B59F 90                nop
TrailDown3:
0012B5A0 8A 46 03          mov       al,byte ptr
0012B5A3 88 47 03          mov       byte ptr ,al
0012B5A6 8A 46 02          mov       al,byte ptr
0012B5A9 88 47 02          mov       byte ptr ,al
0012B5AC 8A 46 01          mov       al,byte ptr
0012B5AF 88 47 01          mov       byte ptr ,al
0012B5B2 8B 44 24 0C       mov       eax,dword ptr
0012B5B6 5E                pop       esi
0012B5B7 5F                pop       edi
0012B5B8 C3                ret
0012B5B9 8D A4 24 00 00 00 00 lea       esp,
VEC_memcpy:
0012B5C0 57                push    edi
0012B5C1 8B C6             mov       eax,esi
0012B5C3 83 E0 0F          and       eax,0Fh
0012B5C6 85 C0             test    eax,eax
0012B5C8 0F 85 D2 00 00 00 jne       TrailDownVec+13Ch (012B6A0h)
L_Aligned:
0012B5CE 8B D1             mov       edx,ecx
0012B5D0 83 E1 7F          and       ecx,7Fh
0012B5D3 C1 EA 07          shr       edx,7
0012B5D6 74 65             je       TrailDownVec+0D9h (012B63Dh)
0012B5D8 8D A4 24 00 00 00 00 lea       esp,
0012B5DF 90                nop
L_1:
0012B5E0 66 0F 6F 06       movdqa    xmm0,xmmword ptr
0012B5E4 66 0F 6F 4E 10    movdqa    xmm1,xmmword ptr
0012B5E9 66 0F 6F 56 20    movdqa    xmm2,xmmword ptr
0012B5EE 66 0F 6F 5E 30    movdqa    xmm3,xmmword ptr
0012B5F3 66 0F 7F 07       movdqa    xmmword ptr ,xmm0
0012B5F7 66 0F 7F 4F 10    movdqa    xmmword ptr ,xmm1
0012B5FC 66 0F 7F 57 20    movdqa    xmmword ptr ,xmm2
0012B601 66 0F 7F 5F 30    movdqa    xmmword ptr ,xmm3
0012B606 66 0F 6F 66 40    movdqa    xmm4,xmmword ptr
0012B60B 66 0F 6F 6E 50    movdqa    xmm5,xmmword ptr
0012B610 66 0F 6F 76 60    movdqa    xmm6,xmmword ptr
0012B615 66 0F 6F 7E 70    movdqa    xmm7,xmmword ptr
0012B61A 66 0F 7F 67 40    movdqa    xmmword ptr ,xmm4
0012B61F 66 0F 7F 6F 50    movdqa    xmmword ptr ,xmm5
0012B624 66 0F 7F 77 60    movdqa    xmmword ptr ,xmm6
0012B629 66 0F 7F 7F 70    movdqa    xmmword ptr ,xmm7
0012B62E 8D B6 80 00 00 00 lea       esi,
0012B634 8D BF 80 00 00 00 lea       edi,
0012B63A 4A                dec       edx
0012B63B 75 A3             jne       TrailDownVec+7Ch (012B5E0h)
L_1a:
0012B63D 85 C9             test    ecx,ecx
0012B63F 74 4F             je       TrailDownVec+12Ch (012B690h)
0012B641 8B D1             mov       edx,ecx
0012B643 C1 EA 04          shr       edx,4
0012B646 85 D2             test    edx,edx
0012B648 74 17             je       TrailDownVec+0FDh (012B661h)
0012B64A 8D 9B 00 00 00 00 lea       ebx,
L_2:
0012B650 66 0F 6F 06       movdqa    xmm0,xmmword ptr
0012B654 66 0F 7F 07       movdqa    xmmword ptr ,xmm0
0012B658 8D 76 10          lea       esi,
0012B65B 8D 7F 10          lea       edi,
0012B65E 4A                dec       edx
0012B65F 75 EF             jne       TrailDownVec+0ECh (012B650h)
L_Trailing:
0012B661 83 E1 0F          and       ecx,0Fh
0012B664 74 2A             je       TrailDownVec+12Ch (012B690h)
0012B666 8B C1             mov       eax,ecx
0012B668 C1 E9 02          shr       ecx,2
0012B66B 74 0D             je       TrailDownVec+116h (012B67Ah)
L_TrailDword:
0012B66D 8B 16             mov       edx,dword ptr
0012B66F 89 17             mov       dword ptr ,edx
0012B671 8D 76 04          lea       esi,
0012B674 8D 7F 04          lea       edi,
0012B677 49                dec       ecx
0012B678 75 F3             jne       TrailDownVec+109h (012B66Dh)
L_TrailBytes:
0012B67A 8B C8             mov       ecx,eax
0012B67C 83 E1 03          and       ecx,3
0012B67F 74 0F             je       TrailDownVec+12Ch (012B690h)
L_TrailNextByte:
0012B681 8A 06             mov       al,byte ptr
0012B683 88 07             mov       byte ptr ,al
0012B685 46                inc       esi
0012B686 47                inc       edi
0012B687 49                dec       ecx
0012B688 75 F7             jne       TrailDownVec+11Dh (012B681h)
0012B68A 8D 9B 00 00 00 00 lea       ebx,
L_Return:
0012B690 58                pop       eax
0012B691 5E                pop       esi
0012B692 5F                pop       edi
0012B693 C3                ret
0012B694 8D A4 24 00 00 00 00 lea       esp,
0012B69B EB 03             jmp       TrailDownVec+13Ch (012B6A0h)
0012B69D CC                int       3
0012B69E CC                int       3
0012B69F CC                int       3
L_Notaligned:
0012B6A0 BA 10 00 00 00    mov       edx,10h
0012B6A5 2B D0             sub       edx,eax
0012B6A7 2B CA             sub       ecx,edx
0012B6A9 51                push    ecx
0012B6AA 8B C2             mov       eax,edx
0012B6AC 8B C8             mov       ecx,eax
0012B6AE 83 E1 03          and       ecx,3
0012B6B1 74 09             je       TrailDownVec+158h (012B6BCh)
L_Byte:
0012B6B3 8A 16             mov       dl,byte ptr
0012B6B5 88 17             mov       byte ptr ,dl
0012B6B7 46                inc       esi
0012B6B8 47                inc       edi
0012B6B9 49                dec       ecx
0012B6BA 75 F7             jne       TrailDownVec+14Fh (012B6B3h)
L_MovDword:
0012B6BC C1 E8 02          shr       eax,2
0012B6BF 74 0D             je       TrailDownVec+16Ah (012B6CEh)
L_Dword:
0012B6C1 8B 16             mov       edx,dword ptr
0012B6C3 89 17             mov       dword ptr ,edx
0012B6C5 8D 76 04          lea       esi,
0012B6C8 8D 7F 04          lea       edi,
0012B6CB 48                dec       eax
0012B6CC 75 F3             jne       TrailDownVec+15Dh (012B6C1h)
L_Adjustcnt:
0012B6CE 59                pop       ecx
0012B6CF E9 FA FE FF FF    jmp       TrailDownVec+6Ah (012B5CEh)

一个memcpy竟然有700+行，这简直比想象中的长得多。没错，它确实不是简简单单的一条rep movsb就把内存复制了。它为什么这样做呢？我来分析一下。
首先开头的废话先不看，这里面有这样的几行注释：

;
; Check for overlapping buffers:
;    If (dst <= src) Or (dst >= src + Count) Then
;             Do normal (Upwards) Copy
;    Else
;             Do Downwards Copy to avoid propagation
;

我给翻译一下：
检查缓冲区是否有重叠：
如果目标地址不高于源地址（dst <= src），或者目标地址不低于源缓冲区的末尾（dst >= src + Count），那么
   做通常的（向后的）复制
否则
   做向前的复制来避免错误复制（复制成片段重复的内容）
（注：上面括号里“复制成片段重复的内容”是我脑补的翻译。此外请注意，我把“向上复制”翻译为“向后复制”，把“向下复制”翻译为“向前复制”。）

嗯虽说C语言规范说memmove能保证缓冲区重叠也能正常复制而memcpy则行为未定义，但VS2012还是做了保守的处理。毕竟比起性能消耗（而且反正后面是有优化的，我接下来讲），少一个坑是一个坑。

接下来从CopyUp开始看：这是正常的向后复制的代码实现。

然后，它先检测“增强快速串处理”是否支持，是的话，直接rep movsb

   ; See if Enhanced Fast Strings is supported.
   ; ENFSTRG supported?
   bt    __favor, __FAVOR_ENFSTRG
   jnc CopyUpSSE2Check             ; no jump
   ;
   ; use Enhanced Fast Strings
   rep movsb
   jmp TrailUp0       ; Done

嗯也就是CPU如果没有这个功能的话，它还有其它的优化手段。可以从jnc CopyUpSSE2Check这条指令看出，它会检测CPU是否支持SSE2指令集，然后使用SSE2指令集进行加速复制。

CopyUpSSE2Check:
;
; Next, see if we can use a "fast" copy SSE2 routine
   ; block size greater than min threshold?
   cmp ecx,080h
   jb    Dword_align; length too small go use dwords
   ; alignments equal?
   mov eax,edi
   xor eax,esi
   test eax,15
   jne AtomChk ; Not aligned go check Atom
   bt    __isa_enabled, __ISA_AVAILABLE_SSE2
   jc    VEC_memcpy ; yes, go SSE2 copy (params already set)

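先判断要复制的长度是否达到128字节（cmp ecx,080h之后用jb跳转，也就是小于128字节时才跳走）。不够的话，此时再用SSE2加速意义不大，它会跳到使用REP MOVSD的地方进行复制。
然后检查源地址和目标地址相对于16字节边界的偏移是否相同（把两个地址异或后测试低4位，对应注释里的“alignments equal?”），注意这并不要求两个地址本身都已经16字节对齐。如果偏移不同，就去检查CPU是不是Atom；如果偏移相同，再检查SSE2是否可用，可用的话直接跳去VEC_memcpy用SSE2指令复制。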

AtomChk:
   ; Is Atom supported?
   bt    __favor, __FAVOR_ATOM
   jnc Dword_align ; no,jump

   ; check if dst is 4 byte aligned
   test edi, 3
   jne CopyLeadUp

   ; check if src is 4 byte aligned
   test esi, 3
   jne Dword_align_Ok

如果不是Atom处理器的话，直接跳到使用REP MOVSD的地方进行复制。
如果是的话，检查目标是否4字节对齐，不是的话跳到CopyLeadUp（CopyLeadUp的入口处的注释写的是“用于对未对齐的数据进行复制”）
然后再检查源是否4字节对齐，不是的话跳到使用REP MOVSD的、已经判断过目标是否对齐的那个地方继续执行。

; A software pipelining vectorized memcpy loop using PALIGN instructions

; (1) copy the first bytes to align dst up to the nearest 16-byte boundary
; 4 byte align -> 12 byte copy, 8 byte align -> 8 byte copy, 12 byte align -> 4 byte copy
PalignHead4:
   bt    edi, 2
   jae PalignHead8
   mov eax, dword ptr
   sub ecx, 4
   lea esi, byte ptr
   mov dword ptr , eax
   lea edi, byte ptr

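此处是：一个使用PALIGNR指令、采用软件流水线（software pipelining）技术的向量化memcpy循环
（1）复制开头的几个字节来让目标地址向上对齐到最近的16字节边界
目标地址模16余4 -> 复制12字节；余8 -> 复制8字节；余12 -> 复制4字节
走到这里时目标地址已经是4字节对齐的了（前面test edi, 3已经检查过）。bt edi, 2检查目标地址的第2位：如果为0，说明目标地址已经是8字节对齐的，直接跳到PalignHead8。
否则用一条mov复制4个字节，让目标地址变成8字节对齐。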

PalignHead8:
   bt    edi, 3
   jae PalignLoop
   movq xmm1, qword ptr
   sub ecx, 8
   lea esi, byte ptr
   movq qword ptr , xmm1
   lea edi, byte ptr

此时目标地址已经8字节对齐。bt edi, 3检查目标地址的第3位：如果为0，说明目标地址已经16字节对齐，直接进入复制循环。
不是的话，拿movq指令做一次8字节复制，让目标地址对齐到16字节。

;(2) Use SSE palign loop
PalignLoop:
   test esi, 7
   je    MovPalign8
   bt    esi, 3
   jae MovPalign4

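这里根据源地址的低位选择复制循环：如果源地址的低3位为0，使用MovPalign8；否则如果第3位为0（源地址模16余4），使用MovPalign4；剩下的情况（源地址模16余12）直接落入PALIGN_memcpy 12。这里的4、8、12指的是源地址相对于16字节边界的偏移，也就是palignr指令的移位字节数。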

PALIGN_memcpy 12
0012B148 66 0F 6F 4E F4    movdqa    xmm1,xmmword ptr
0012B14D 8D 76 F4          lea       esi,
PalignLoop12:
0012B150 66 0F 6F 5E 10    movdqa    xmm3,xmmword ptr
0012B155 83 E9 30          sub       ecx,30h
0012B158 66 0F 6F 46 20    movdqa    xmm0,xmmword ptr
0012B15D 66 0F 6F 6E 30    movdqa    xmm5,xmmword ptr
0012B162 8D 76 30          lea       esi,
0012B165 83 F9 30          cmp       ecx,30h
0012B168 66 0F 6F D3       movdqa    xmm2,xmm3
0012B16C 66 0F 3A 0F D9 0C palignr xmm3,xmm1,0Ch
0012B172 66 0F 7F 1F       movdqa    xmmword ptr ,xmm3
0012B176 66 0F 6F E0       movdqa    xmm4,xmm0
0012B17A 66 0F 3A 0F C2 0C palignr xmm0,xmm2,0Ch
0012B180 66 0F 7F 47 10    movdqa    xmmword ptr ,xmm0
0012B185 66 0F 6F CD       movdqa    xmm1,xmm5
0012B189 66 0F 3A 0F EC 0C palignr xmm5,xmm4,0Ch
0012B18F 66 0F 7F 6F 20    movdqa    xmmword ptr ,xmm5
0012B194 8D 7F 30          lea       edi,
0012B197 7D B7             jge       CopyUp+0A0h (012B150h)
0012B199 8D 76 0C          lea       esi,
   jmp PalignTail
0012B19C E9 AF 00 00 00    jmp       CopyUp+1A0h (012B250h)

上面这一部分在调试器里只显示了反汇编而没有对应的源码行，因为它是MASM的宏PALIGN_memcpy（参数为12）展开后生成的代码。这里处理的是源地址相对16字节边界偏移12字节的情况。

PALIGN_memcpy 8
0012B1A1 66 0F 6F 4E F8    movdqa    xmm1,xmmword ptr
0012B1A6 8D 76 F8          lea       esi,
0012B1A9 8D 49 00          lea       ecx,
PalignLoop8:
0012B1AC 66 0F 6F 5E 10    movdqa    xmm3,xmmword ptr
0012B1B1 83 E9 30          sub       ecx,30h
0012B1B4 66 0F 6F 46 20    movdqa    xmm0,xmmword ptr
0012B1B9 66 0F 6F 6E 30    movdqa    xmm5,xmmword ptr
0012B1BE 8D 76 30          lea       esi,
0012B1C1 83 F9 30          cmp       ecx,30h
0012B1C4 66 0F 6F D3       movdqa    xmm2,xmm3
0012B1C8 66 0F 3A 0F D9 08 palignr xmm3,xmm1,8
0012B1CE 66 0F 7F 1F       movdqa    xmmword ptr ,xmm3
0012B1D2 66 0F 6F E0       movdqa    xmm4,xmm0
0012B1D6 66 0F 3A 0F C2 08 palignr xmm0,xmm2,8
0012B1DC 66 0F 7F 47 10    movdqa    xmmword ptr ,xmm0
0012B1E1 66 0F 6F CD       movdqa    xmm1,xmm5
0012B1E5 66 0F 3A 0F EC 08 palignr xmm5,xmm4,8
0012B1EB 66 0F 7F 6F 20    movdqa    xmmword ptr ,xmm5
0012B1F0 8D 7F 30          lea       edi,
0012B1F3 7D B7             jge       CopyUp+0FCh (012B1ACh)
0012B1F5 8D 76 08          lea       esi,
   jmp PalignTail
0012B1F8 EB 56             jmp       CopyUp+1A0h (012B250h)

上面这里是8字节对齐的复制。

PALIGN_memcpy 4
0012B1FA 66 0F 6F 4E FC    movdqa    xmm1,xmmword ptr
0012B1FF 8D 76 FC          lea       esi,
0012B202 8B FF             mov       edi,edi
PalignLoop4:
0012B204 66 0F 6F 5E 10    movdqa    xmm3,xmmword ptr
0012B209 83 E9 30          sub       ecx,30h
0012B20C 66 0F 6F 46 20    movdqa    xmm0,xmmword ptr
0012B211 66 0F 6F 6E 30    movdqa    xmm5,xmmword ptr
0012B216 8D 76 30          lea       esi,
0012B219 83 F9 30          cmp       ecx,30h
0012B21C 66 0F 6F D3       movdqa    xmm2,xmm3
0012B220 66 0F 3A 0F D9 04 palignr xmm3,xmm1,4
0012B226 66 0F 7F 1F       movdqa    xmmword ptr ,xmm3
0012B22A 66 0F 6F E0       movdqa    xmm4,xmm0
0012B22E 66 0F 3A 0F C2 04 palignr xmm0,xmm2,4
0012B234 66 0F 7F 47 10    movdqa    xmmword ptr ,xmm0
0012B239 66 0F 6F CD       movdqa    xmm1,xmm5
0012B23D 66 0F 3A 0F EC 04 palignr xmm5,xmm4,4
0012B243 66 0F 7F 6F 20    movdqa    xmmword ptr ,xmm5
0012B248 8D 7F 30          lea       edi,
0012B24B 7D B7             jge       CopyUp+154h (012B204h)
0012B24D 8D 76 04          lea       esi,

上面这里是4字节对齐复制。

;(3) Copy the tailing bytes.
PalignTail:
   cmp ecx,10h
   jl PalignTail4
   movdqu xmm1,xmmword ptr
   sub ecx, 10h
   lea esi, xmmword ptr
   movdqa xmmword ptr ,xmm1
   lea edi, xmmword ptr
   jmp PalignTail

上面这里是复制结尾的字节。

PalignTail4:
   bt    ecx, 2
   jae PalignTail8
   mov eax, dword ptr
   sub ecx,4
   lea esi, byte ptr
   mov dword ptr , eax
   lea edi, byte ptr

PalignTail8:
   bt    ecx, 3
   jae PalignTailLE3
   movq xmm1, qword ptr
   sub ecx,8
   lea esi, byte ptr
   movq qword ptr , xmm1
   lea edi, byte ptr

PalignTailLE3:
   mov eax, dword ptr TrailUpVec
   jmp eax

上面的代码，是根据对齐的方式做最后的处理。

; The algorithm for forward moves is to align the destination to a dword
; boundary and so we can move dwords with an aligned destination.This
; occurs in 3 steps.
;
; - move x = ((4 - Dest & 3) & 3) bytes
; - move y = ((L-x) >> 2) dwords
; - move (L - x - y*4) bytes
;

Dword_align:
   test edi,11b       ;U - destination dword aligned?
   jnz short CopyLeadUp ;V - if we are not dword aligned already, align
Dword_align_Ok:
   shr ecx,2       ;U - shift down to dword count
   and edx,11b       ;V - trailing byte count

   cmp ecx,8       ;U - test if small enough for unwind copy
   jb    short CopyUnwindUp ;V - if so, then jump

   rep movsd       ;N - move all of our dwords

   jmp dword ptr TrailUpVec ;N - process trailing bytes

这里就是之前所说的使用REP MOVSD的地方进行复制。但即使是使用REP MOVSD，也要判断地址是否是对齐的，只有在对齐的状态下，才适合这样复制。不然会掉速。

;
; Code to do optimal memory copies for non-dword-aligned destinations.
;

; The following length check is done for two reasons:
;
; 1. to ensure that the actual move length is greater than any possiale
;    alignment move, and
;
; 2. to skip the multiple move logic for small moves where it would
;    be faster to move the bytes with one instruction.
;

   align @WordSize
CopyLeadUp:

   mov eax,edi       ;U - get destination offset
   mov edx,11b       ;V - prepare for mask

   sub ecx,4       ;U - check for really short string - sub for adjust
   jb    short ByteCopyUp ;V - branch to just copy bytes

   and eax,11b       ;U - get offset within first dword
   add ecx,eax       ;V - update size after leading bytes copied

   jmp dword ptr LeadUpVec ;N - process leading bytes

   align @WordSize
ByteCopyUp:
   jmp dword ptr TrailUpVec ;N - process just bytes

   align @WordSize
CopyUnwindUp:
   jmp dword ptr UnwindUpVec ;N - unwind dword copy

此处的代码负责对目标地址没有4字节对齐的复制操作进行优化。下面的长度检查存在的原因有两个：
1、确保实际要复制的长度大于任何可能的对齐所需的复制长度
2、对于很短的复制，跳过多段复制的逻辑，因为这时直接按字节复制会更快一些

后面的部分没有源文件我也不分析了，不过值得留意的一个片段，我这里专门提一下：

L_1:
0012B5E0 66 0F 6F 06       movdqa    xmm0,xmmword ptr
0012B5E4 66 0F 6F 4E 10    movdqa    xmm1,xmmword ptr
0012B5E9 66 0F 6F 56 20    movdqa    xmm2,xmmword ptr
0012B5EE 66 0F 6F 5E 30    movdqa    xmm3,xmmword ptr
0012B5F3 66 0F 7F 07       movdqa    xmmword ptr ,xmm0
0012B5F7 66 0F 7F 4F 10    movdqa    xmmword ptr ,xmm1
0012B5FC 66 0F 7F 57 20    movdqa    xmmword ptr ,xmm2
0012B601 66 0F 7F 5F 30    movdqa    xmmword ptr ,xmm3
0012B606 66 0F 6F 66 40    movdqa    xmm4,xmmword ptr
0012B60B 66 0F 6F 6E 50    movdqa    xmm5,xmmword ptr
0012B610 66 0F 6F 76 60    movdqa    xmm6,xmmword ptr
0012B615 66 0F 6F 7E 70    movdqa    xmm7,xmmword ptr
0012B61A 66 0F 7F 67 40    movdqa    xmmword ptr ,xmm4
0012B61F 66 0F 7F 6F 50    movdqa    xmmword ptr ,xmm5
0012B624 66 0F 7F 77 60    movdqa    xmmword ptr ,xmm6
0012B629 66 0F 7F 7F 70    movdqa    xmmword ptr ,xmm7

这里面使用了8个SSE2寄存器进行复制操作，每个寄存器能存储16个字节，用这种方式进行复制的时候，这些movdqa指令其实有可能是并发执行的，尤其是你的内存是多通道的情况。此处进行数据量大的内容的复制的时候，效率应该会有很大的提升。

VS2012的memcpy函数，根据数据的量的大小对齐的情况，对每一种情况都进行了优化处理，各方面证明了它是无可替代的。不是什么时候，自己造的轮子都比别人的好。

cyycoish 发表于 2017-5-30 17:10:53

这个帖子简直漂亮！
这边发一个别人用C实现的malloc，当然，malloc也不是那么简单的！
ftp://gee.cs.oswego.edu/pub/misc/malloc.c
注意这个开源代码协议是public domain。

Ayala 发表于 2017-5-30 23:30:45

   page ,132
   title memcpy - Copy source memory bytes to destination
;***
;memcpy.asm - contains memcpy and memmove routines
;
;    Copyright (c) Microsoft Corporation. All rights reserved.
;
;Purpose:
;    memcpy() copies a source memory buffer to a destination buffer.
;    Overlapping buffers are not treated specially, so propagation may occur.
;    memmove() copies a source memory buffer to a destination buffer.
;    Overlapping buffers are treated specially, to avoid propagation.
;
;*******************************************************************************

   .xlist
   include cruntime.inc
   .list
   .xmm

; M_EXIT - common function exit: a plain near return (_cdecl, so the caller
; removes the arguments from the stack).
M_EXIT macro
   ret                   ; _cdecl return
   endm ; M_EXIT

; PALIGN_memcpy d - emit one copy loop for a source that sits d bytes (4, 8 or 12)
; past a 16-byte boundary while the destination is 16-byte aligned.
;
; In:    esi = source, edi = destination, ecx = byte count
; Out:   esi/edi advanced past the copied data, ecx = bytes still to copy
; Uses:  xmm0-xmm5, flags
;
; esi is first rewound by d so that every load below is an aligned movdqa; each
; pair of adjacent 16-byte loads is then merged with "palignr ..., d" to rebuild
; the 16 destination bytes. One iteration stores 30h (48) bytes. xmm1 always
; carries the last block loaded by the previous iteration.
; NOTE(review): movdqa faults on addresses that are not 16-byte aligned, so the
; caller must guarantee the alignment described above.
PALIGN_memcpy macro d
MovPalign&d&:
   movdqa xmm1,xmmword ptr [esi-d]
   lea esi, byte ptr [esi-d]        ; rewind src to the 16-byte boundary
align @WordSize
PalignLoop&d&:
   movdqa xmm3,xmmword ptr [esi+10h]
   sub ecx,30h
   movdqa xmm0,xmmword ptr [esi+20h]
   movdqa xmm5,xmmword ptr [esi+30h]
   lea esi, xmmword ptr [esi+30h]
   cmp ecx,30h                      ; flags are consumed by the jge below; none of the
                                    ; movdqa/palignr/lea in between modify them
   movdqa xmm2,xmm3

   palignr xmm3,xmm1,d

   movdqa xmmword ptr [edi],xmm3
   movdqa xmm4,xmm0

   palignr xmm0,xmm2,d

   movdqa xmmword ptr [edi+10h],xmm0
   movdqa xmm1,xmm5                 ; keep last block for the next iteration

   palignr xmm5,xmm4,d

   movdqa xmmword ptr [edi+20h],xmm5
   lea edi, xmmword ptr [edi+30h]
   jge PalignLoop&d&                ; at least another 30h bytes left
   lea esi, xmmword ptr [esi+d]     ; undo the rewind: esi = next unread source byte

   endm ; PALIGN_memcpy

   CODESEG

extrn __isa_available:dword
extrn __isa_enabled:dword
extrn __favor:dword

page
;***
;memcpy - Copy source buffer to destination buffer
;
;Purpose:
;    memcpy() copies a source memory buffer to a destination memory buffer.
;    This routine does NOT recognize overlapping buffers, and thus can lead
;    to propagation.
;    For cases where propagation must be avoided, memmove() must be used.
;
;    Algorithm:
;
;       Same as memmove. See Below
;
;
;memmove - Copy source buffer to destination buffer
;
;Purpose:
;    memmove() copies a source memory buffer to a destination memory buffer.
;    This routine recognizes overlapping buffers to avoid propagation.
;    For cases where propagation is not a problem, memcpy() can be used.
;
; Algorithm:
;
;    void * memmove(void * dst, void * src, size_t count)
;    {
;             void * ret = dst;
;
;             if (dst <= src || dst >= (src + count)) {
;                   /*
;                      * Non-Overlapping Buffers
;                      * copy from lower addresses to higher addresses
;                      */
;                   while (count--)
;                            *dst++ = *src++;
;                   }
;             else {
;                   /*
;                      * Overlapping Buffers
;                      * copy from higher addresses to lower addresses
;                      */
;                   dst += count - 1;
;                   src += count - 1;
;
;                   while (count--)
;                            *dst-- = *src--;
;                   }
;
;             return(ret);
;    }
;
;
;Entry:
;    void *dst = pointer to destination buffer
;    const void *src = pointer to source buffer
;    size_t count = number of bytes to copy
;
;Exit:
;    Returns a pointer to the destination buffer in AX/DX:AX
;
;Uses:
;    CX, DX
;
;Exceptions:
;*******************************************************************************

ifdef MEM_MOVE
   _MEM_ equ <memmove>
else; MEM_MOVE
   _MEM_ equ <memcpy>
endif; MEM_MOVE

%    public _MEM_
_MEM_ proc \
   dst:ptr byte, \
   src:ptr byte, \
   count:IWORD

   ; destination pointer
   ; source pointer
   ; number of bytes to copy

   OPTION PROLOGUE:NONE, EPILOGUE:NONE

   push edi          ;U - save edi
   push esi          ;V - save esi

;                size param/4 prolog byte#reg saved
   .FPO ( 0, 3       , $-_MEM_ , 2, 0, 0 )

   ; Stack after the two pushes:
   ;   [esp+00h] saved esi, [esp+04h] saved edi, [esp+08h] return address,
   ;   [esp+0Ch] dst,       [esp+10h] src,       [esp+14h] count
   mov esi,[esp + 010h] ;U - esi = source
   mov ecx,[esp + 014h] ;V - ecx = number of bytes to move
   mov edi,[esp + 0Ch]    ;U - edi = dest

;
; Check for overlapping buffers:
;    If (dst <= src) Or (dst >= src + Count) Then
;             Do normal (Upwards) Copy
;    Else
;             Do Downwards Copy to avoid propagation
;
; Register state handed to CopyUp / CopyDown:
;    esi = src, edi = dst, ecx = edx = count, eax = src + count
;

   mov eax,ecx       ;V - eax = byte count...

   mov edx,ecx       ;U - edx = byte count...
   add eax,esi       ;V - eax = point past source end

   cmp edi,esi       ;U - dst <= src ?
   jbe short CopyUp ;V - yes, copy toward higher addresses

   cmp edi,eax       ;U - dst < (src + count) ?
   jb    CopyDown    ;V - yes, copy toward lower addresses

;
; Copy toward higher addresses.
;
CopyUp:
;
; Forward-copy dispatcher. Entered with esi = src, edi = dst, ecx = edx = count.
; Picks, in order: rep movsb (ENFSTRG bit of __favor set), the SSE2 block copy
; (count >= 80h, src and dst share the same offset mod 16, SSE2 bit of
; __isa_enabled set), the PALIGN loops (ATOM bit of __favor set and both
; pointers dword aligned), or the dword copy at Dword_align.
;
   ; See if Enhanced Fast Strings is supported.
   ; ENFSTRG supported?
   bt    __favor, __FAVOR_ENFSTRG
   jnc CopyUpSSE2Check             ; bit clear: not favored, try the next option
   ;
   ; use Enhanced Fast Strings
   rep movsb
   jmp TrailUp0       ; Done
CopyUpSSE2Check:
;
; Next, see if we can use a "fast" copy SSE2 routine
   ; block size greater than min threshold?
   cmp ecx,080h
   jb    Dword_align; length too small go use dwords
   ; alignments equal? (low 4 bits of src and dst identical)
   mov eax,edi
   xor eax,esi
   test eax,15
   jne AtomChk ; offsets within a 16-byte line differ, go check Atom
   bt    __isa_enabled, __ISA_AVAILABLE_SSE2
   jc    VEC_memcpy ; yes, go SSE2 copy (params already set)
AtomChk:
   ; Is Atom supported?
   bt    __favor, __FAVOR_ATOM
   jnc Dword_align ; no,jump

   ; check if dst is 4 byte aligned
   test edi, 3
   jne CopyLeadUp

   ; check if src is 4 byte aligned
   ; (dst is known dword aligned here, so the alignment test at Dword_align is skipped)
   test esi, 3
   jne Dword_align_Ok

; A software pipelining vectorized memcpy loop using PALIGNR instructions
;
; Reached by fall-through from AtomChk with esi = src and edi = dst both
; dword aligned and ecx = byte count.

; (1) copy the first bytes to align dst up to the nearest 16-byte boundary
; dst mod 16 = 4 -> 12 byte copy, = 8 -> 8 byte copy, = 12 -> 4 byte copy
PalignHead4:
   bt    edi, 2                    ; bit 2 clear: dst already 8-byte aligned
   jae PalignHead8
   mov eax, dword ptr [esi]
   sub ecx, 4
   lea esi, byte ptr [esi+4]
   mov dword ptr [edi], eax
   lea edi, byte ptr [edi+4]

PalignHead8:
   bt    edi, 3                    ; bit 3 clear: dst already 16-byte aligned
   jae PalignLoop
   movq xmm1, qword ptr [esi]
   sub ecx, 8
   lea esi, byte ptr [esi+8]
   movq qword ptr [edi], xmm1
   lea edi, byte ptr [edi+8]

;(2) Use SSE palign loop, selected by the low bits of the (dword aligned) source
PalignLoop:
   test esi, 7
   je    MovPalign8                ; low three bits of src clear
   bt    esi, 3
   jae MovPalign4                  ; bit 3 clear: src mod 16 = 4

   ; otherwise src mod 16 = 12: fall into the first expansion
PALIGN_memcpy 12
   jmp PalignTail

PALIGN_memcpy 8
   jmp PalignTail

PALIGN_memcpy 4

;(3) Copy the trailing bytes.
; In: esi = src, edi = dst (16-byte aligned, stored with movdqa), ecx = bytes left.
; 16-byte blocks first, then one optional dword and one optional qword chosen
; by bits 2 and 3 of ecx, and finally 0-3 bytes through TrailUpVec.
PalignTail:
   cmp ecx,10h
   jl PalignTail4
   movdqu xmm1,xmmword ptr [esi]   ; source may be unaligned here
   sub ecx, 10h
   lea esi, xmmword ptr [esi+10h]
   movdqa xmmword ptr [edi],xmm1
   lea edi, xmmword ptr [edi+10h]
   jmp PalignTail

PalignTail4:
   bt    ecx, 2
   jae PalignTail8
   mov eax, dword ptr [esi]
   sub ecx,4
   lea esi, byte ptr [esi+4]
   mov dword ptr [edi], eax
   lea edi, byte ptr [edi+4]

PalignTail8:
   bt    ecx, 3
   jae PalignTailLE3
   movq xmm1, qword ptr [esi]
   sub ecx,8
   lea esi, byte ptr [esi+8]
   movq qword ptr [edi], xmm1
   lea edi, byte ptr [edi+8]

PalignTailLE3:
   ; ecx = 0..3 bytes left: dispatch to TrailUp0..TrailUp3
   mov eax, dword ptr TrailUpVec[ecx*4]
   jmp eax

; The algorithm for forward moves is to align the destination to a dword
; boundary and so we can move dwords with an aligned destination.This
; occurs in 3 steps.
;
; - move x = ((4 - Dest & 3) & 3) bytes
; - move y = ((L-x) >> 2) dwords
; - move (L - x - y*4) bytes
;

; In: esi = src, edi = dst, ecx = edx = byte count.
; Dword_align_Ok is also entered directly when dst is already known to be
; dword aligned.
Dword_align:
   test edi,11b       ;U - destination dword aligned?
   jnz short CopyLeadUp ;V - if we are not dword aligned already, align
Dword_align_Ok:
   shr ecx,2       ;U - shift down to dword count
   and edx,11b       ;V - trailing byte count

   cmp ecx,8       ;U - test if small enough for unwind copy
   jb    short CopyUnwindUp ;V - if so, then jump

   rep movsd       ;N - move all of our dwords

   jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes (edx = 0..3)

;
; Code to do optimal memory copies for non-dword-aligned destinations.
;

; The following length check is done for two reasons:
;
; 1. to ensure that the actual move length is greater than any possible
;    alignment move, and
;
; 2. to skip the multiple move logic for small moves where it would
;    be faster to move the bytes with one instruction.
;

   align @WordSize
; CopyLeadUp - destination is not dword aligned.
; In:  esi = src, edi = dst, ecx = byte count
; Out: jumps to LeadUp1..3 with eax = dst & 3 (1..3), edx = 3 (mask) and
;      ecx = count - (4 - eax), i.e. the length left once dst is aligned;
;      or, for count < 4, jumps straight to the matching TrailUp routine.
CopyLeadUp:

   mov eax,edi       ;U - get destination offset
   mov edx,11b       ;V - prepare for mask

   sub ecx,4       ;U - check for really short string - sub for adjust
   jb    short ByteCopyUp ;V - branch to just copy bytes

   and eax,11b       ;U - get offset within first dword
   add ecx,eax       ;V - update size after leading bytes copied

   jmp dword ptr LeadUpVec[eax*4-4] ;N - process leading bytes

   align @WordSize
ByteCopyUp:
   ; ecx = count - 4 (-4..-1), so [ecx*4+16] indexes TrailUpVec by count
   jmp dword ptr TrailUpVec[ecx*4+16] ;N - process just bytes

   align @WordSize
CopyUnwindUp:
   ; ecx = dword count (0..7)
   jmp dword ptr UnwindUpVec[ecx*4] ;N - unwind dword copy

   align @WordSize
; Leading-byte routines for the forward copy, indexed by (dst & 3) - 1.
; LeadUp1/2/3 copy 3/2/1 bytes so that edi becomes dword aligned.
; In:  esi = src, edi = dst, edx = 3, ecx = byte count left after the leading bytes
; Out: edx = trailing byte count (ecx & 3), ecx = dword count, then the dwords
;      are copied with rep movsd (or the unwound copy when fewer than 8).
LeadUpVec    dd    LeadUp1, LeadUp2, LeadUp3

   align @WordSize
LeadUp1:
   and edx,ecx       ;U - trailing byte count
   mov al,[esi]    ;V - get first byte from source

   mov [edi],al    ;U - write first byte to destination
   mov al,[esi+1]    ;V - get second byte from source

   mov [edi+1],al    ;U - write second byte to destination
   mov al,[esi+2]    ;V - get third byte from source

   shr ecx,2       ;U - shift down to dword count
   mov [edi+2],al    ;V - write third byte to destination

   add esi,3       ;U - advance source pointer
   add edi,3       ;V - advance destination pointer

   cmp ecx,8       ;U - test if small enough for unwind copy
   jb    short CopyUnwindUp ;V - if so, then jump

   rep movsd       ;N - move all of our dwords

   jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

   align @WordSize
LeadUp2:
   and edx,ecx       ;U - trailing byte count
   mov al,[esi]    ;V - get first byte from source

   mov [edi],al    ;U - write first byte to destination
   mov al,[esi+1]    ;V - get second byte from source

   shr ecx,2       ;U - shift down to dword count
   mov [edi+1],al    ;V - write second byte to destination

   add esi,2       ;U - advance source pointer
   add edi,2       ;V - advance destination pointer

   cmp ecx,8       ;U - test if small enough for unwind copy
   jb    short CopyUnwindUp ;V - if so, then jump

   rep movsd       ;N - move all of our dwords

   jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

   align @WordSize
LeadUp3:
   and edx,ecx       ;U - trailing byte count
   mov al,[esi]    ;V - get first byte from source

   mov [edi],al    ;U - write first byte to destination
   add esi,1       ;V - advance source pointer

   shr ecx,2       ;U - shift down to dword count
   add edi,1       ;V - advance destination pointer

   cmp ecx,8       ;U - test if small enough for unwind copy
   jb    short CopyUnwindUp ;V - if so, then jump

   rep movsd       ;N - move all of our dwords

   jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

   align @WordSize
; Unrolled forward dword copy for 0..7 dwords, indexed by the dword count.
; In:  esi = src, edi = dst, ecx = dword count n, edx = trailing byte count
; Entering at UnwindUp<n> makes the first access land on [esi+0]: the
; displacement -4*n cancels ecx*4. Execution then falls through the remaining
; entries, and afterwards both pointers are advanced by ecx*4 bytes.
UnwindUpVec dd    UnwindUp0, UnwindUp1, UnwindUp2, UnwindUp3
            dd    UnwindUp4, UnwindUp5, UnwindUp6, UnwindUp7

UnwindUp7:
   mov eax,[esi+ecx*4-28] ;U - get dword from source
                              ;V - spare
   mov [edi+ecx*4-28],eax ;U - put dword into destination
UnwindUp6:
   mov eax,[esi+ecx*4-24] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4-24],eax ;U - put dword into destination
UnwindUp5:
   mov eax,[esi+ecx*4-20] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4-20],eax ;U - put dword into destination
UnwindUp4:
   mov eax,[esi+ecx*4-16] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4-16],eax ;U - put dword into destination
UnwindUp3:
   mov eax,[esi+ecx*4-12] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4-12],eax ;U - put dword into destination
UnwindUp2:
   mov eax,[esi+ecx*4-8] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4-8],eax ;U - put dword into destination
UnwindUp1:
   mov eax,[esi+ecx*4-4] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4-4],eax ;U - put dword into destination

   lea eax,[ecx*4] ;V - compute update for pointer

   add esi,eax       ;U - update source pointer
   add edi,eax       ;V - update destination pointer
UnwindUp0:
   jmp dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

;-----------------------------------------------------------------------------

   align @WordSize
; Trailing-byte routines for the forward copy, indexed by the number of bytes
; (0..3) still to copy. Each one copies its bytes from [esi] to [edi], loads the
; original dst argument as the return value, restores esi/edi and returns.
; [esp+0Ch] is dst while the two registers saved at entry are still on the stack.
TrailUpVec    dd    TrailUp0, TrailUp1, TrailUp2, TrailUp3

   align @WordSize
TrailUp0:
   mov eax,[esp + 0Ch] ;U - return pointer to destination
   pop esi          ;V - restore esi
   pop edi          ;U - restore edi
                           ;V - spare
   M_EXIT

   align @WordSize
TrailUp1:
   mov al,[esi]    ;U - get byte from source
                           ;V - spare
   mov [edi],al    ;U - put byte in destination
   mov eax,[esp + 0Ch] ;V - return pointer to destination
   pop esi          ;U - restore esi
   pop edi          ;V - restore edi
   M_EXIT

   align @WordSize
TrailUp2:
   mov al,[esi]    ;U - get first byte from source
                           ;V - spare
   mov [edi],al    ;U - put first byte into destination
   mov al,[esi+1]    ;V - get second byte from source
   mov [edi+1],al    ;U - put second byte into destination
   mov eax,[esp + 0Ch] ;V - return pointer to destination
   pop esi          ;U - restore esi
   pop edi          ;V - restore edi
   M_EXIT

   align @WordSize
TrailUp3:
   mov al,[esi]    ;U - get first byte from source
                           ;V - spare
   mov [edi],al    ;U - put first byte into destination
   mov al,[esi+1]    ;V - get second byte from source
   mov [edi+1],al    ;U - put second byte into destination
   mov al,[esi+2]    ;V - get third byte from source
   mov [edi+2],al    ;U - put third byte into destination
   mov eax,[esp + 0Ch] ;V - return pointer to destination
   pop esi          ;U - restore esi
   pop edi          ;V - restore edi
   M_EXIT

;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------

;
; Copy down to avoid propagation in overlapping buffers.
;
   align @WordSize
; CopyDown - backward copy for overlapping buffers (src < dst < src + count).
; In:  esi = src, edi = dst, ecx = edx = byte count
; Both pointers are moved to the last dword of their buffer (end - 4), so the
; final byte of each buffer is at offset +3 from the pointer.
CopyDown:
   lea esi,[esi+ecx-4] ;U - point to 4 bytes before src buffer end
   lea edi,[edi+ecx-4] ;V - point to 4 bytes before dest buffer end
;
; See if the destination start is dword aligned
;

   test edi,11b       ;U - test if dword aligned
   jnz short CopyLeadDown ;V - if not, jump

   shr ecx,2       ;U - shift down to dword count
   and edx,11b       ;V - trailing byte count

   cmp ecx,8       ;U - test if small enough for unwind copy
   jb    short CopyUnwindDown ;V - if so, then jump

   std                   ;N - set direction flag
   rep movsd       ;N - move all of our dwords
   cld                   ;N - clear direction flag back

   jmp dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

   align @WordSize
CopyUnwindDown:
   neg ecx          ;U - negate dword count for table merging
                           ;V - spare

   ; ecx = -n (n = 0..7): [ecx*4+28] selects UnwindDown<n> from the reversed table
   jmp dword ptr UnwindDownVec[ecx*4+28] ;N - unwind copy

   align @WordSize
CopyLeadDown:

   mov eax,edi       ;U - get destination offset
   mov edx,11b       ;V - prepare for mask

   cmp ecx,4       ;U - check for really short string
   jb    short ByteCopyDown ;V - branch to just copy bytes

   and eax,11b       ;U - get offset within first dword
   sub ecx,eax       ;U - to update size after lead copied

   jmp dword ptr LeadDownVec[eax*4-4] ;N - process leading bytes

   align @WordSize
ByteCopyDown:
   ; ecx = byte count (0..3), unchanged by the cmp above
   jmp dword ptr TrailDownVec[ecx*4] ;N - process just bytes

   align @WordSize
; Leading-byte routines for the backward copy, indexed by (edi & 3) - 1.
; LeadDown1/2/3 copy the 1/2/3 highest bytes of the buffer (at offsets +3, +2,
; +1 from the end-4 pointers) so that the remaining dwords end on a dword boundary.
; In:  esi/edi = buffer end - 4, edx = 3, ecx = byte count left after the leading bytes
; Out: edx = trailing byte count (ecx & 3), ecx = dword count.
LeadDownVec dd    LeadDown1, LeadDown2, LeadDown3

   align @WordSize
LeadDown1:
   mov al,[esi+3]    ;U - load first byte
   and edx,ecx       ;V - trailing byte count

   mov [edi+3],al    ;U - write out first byte
   sub esi,1       ;V - point to last src dword

   shr ecx,2       ;U - shift down to dword count
   sub edi,1       ;V - point to last dest dword

   cmp ecx,8       ;U - test if small enough for unwind copy
   jb    short CopyUnwindDown ;V - if so, then jump

   std                   ;N - set direction flag
   rep movsd       ;N - move all of our dwords
   cld                   ;N - clear direction flag

   jmp dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

   align @WordSize
LeadDown2:
   mov al,[esi+3]    ;U - load first byte
   and edx,ecx       ;V - trailing byte count

   mov [edi+3],al    ;U - write out first byte
   mov al,[esi+2]    ;V - get second byte from source

   shr ecx,2       ;U - shift down to dword count
   mov [edi+2],al    ;V - write second byte to destination

   sub esi,2       ;U - point to last src dword
   sub edi,2       ;V - point to last dest dword

   cmp ecx,8       ;U - test if small enough for unwind copy
   jb    short CopyUnwindDown ;V - if so, then jump

   std                   ;N - set direction flag
   rep movsd       ;N - move all of our dwords
   cld                   ;N - clear direction flag

   jmp dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

   align @WordSize
LeadDown3:
   mov al,[esi+3]    ;U - load first byte
   and edx,ecx       ;V - trailing byte count

   mov [edi+3],al    ;U - write out first byte
   mov al,[esi+2]    ;V - get second byte from source

   mov [edi+2],al    ;U - write second byte to destination
   mov al,[esi+1]    ;V - get third byte from source

   shr ecx,2       ;U - shift down to dword count
   mov [edi+1],al    ;V - write third byte to destination

   sub esi,3       ;U - point to last src dword
   sub edi,3       ;V - point to last dest dword

   cmp ecx,8       ;U - test if small enough for unwind copy
   jb    CopyUnwindDown;V - if so, then jump

   std                   ;N - set direction flag
   rep movsd       ;N - move all of our dwords
   cld                   ;N - clear direction flag

   jmp dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

;------------------------------------------------------------------

   align @WordSize
; Unrolled backward dword copy for 0..7 dwords. The table is stored in reverse
; order because CopyUnwindDown indexes it with the negated dword count.
; In:  esi/edi = highest dword to copy, ecx = -n (n = dword count), edx = trailing bytes
; Entering at UnwindDown<n> makes the first access land on [esi+0]: the
; displacement +4*n cancels ecx*4. Afterwards both pointers are moved down by
; 4*n bytes (ecx*4 is negative).
UnwindDownVec dd    UnwindDown7, UnwindDown6, UnwindDown5, UnwindDown4
            dd    UnwindDown3, UnwindDown2, UnwindDown1, UnwindDown0

UnwindDown7:
   mov eax,[esi+ecx*4+28] ;U - get dword from source
                              ;V - spare
   mov [edi+ecx*4+28],eax ;U - put dword into destination
UnwindDown6:
   mov eax,[esi+ecx*4+24] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4+24],eax ;U - put dword into destination
UnwindDown5:
   mov eax,[esi+ecx*4+20] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4+20],eax ;U - put dword into destination
UnwindDown4:
   mov eax,[esi+ecx*4+16] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4+16],eax ;U - put dword into destination
UnwindDown3:
   mov eax,[esi+ecx*4+12] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4+12],eax ;U - put dword into destination
UnwindDown2:
   mov eax,[esi+ecx*4+8] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4+8],eax ;U - put dword into destination
UnwindDown1:
   mov eax,[esi+ecx*4+4] ;U(entry)/V(not) - get dword from source
                              ;V(entry) - spare
   mov [edi+ecx*4+4],eax ;U - put dword into destination

   lea eax,[ecx*4] ;V - compute update for pointer

   add esi,eax       ;U - update source pointer
   add edi,eax       ;V - update destination pointer
UnwindDown0:
   jmp dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

;-----------------------------------------------------------------------------

   align @WordSize
; Trailing-byte routines for the backward copy, indexed by the number of bytes
; (0..3) still to copy. esi/edi point 4 bytes below the lowest data already
; copied, so the remaining bytes sit at offsets +3, +2, +1 and are copied from
; the highest address downwards. Each routine then loads the original dst
; argument ([esp+0Ch]) as the return value, restores esi/edi and returns.
TrailDownVec dd    TrailDown0, TrailDown1, TrailDown2, TrailDown3

   align @WordSize
TrailDown0:
   mov eax,[esp + 0Ch] ;U - return pointer to destination
                           ;V - spare
   pop esi          ;U - restore esi
   pop edi          ;V - restore edi
   M_EXIT

   align @WordSize
TrailDown1:
   mov al,[esi+3]    ;U - get byte from source
                           ;V - spare
   mov [edi+3],al    ;U - put byte in destination
   mov eax,[esp + 0Ch] ;V - return pointer to destination
   pop esi          ;U - restore esi
   pop edi          ;V - restore edi
   M_EXIT

   align @WordSize
TrailDown2:
   mov al,[esi+3]    ;U - get first byte from source
                           ;V - spare
   mov [edi+3],al    ;U - put first byte into destination
   mov al,[esi+2]    ;V - get second byte from source
   mov [edi+2],al    ;U - put second byte into destination
   mov eax,[esp + 0Ch] ;V - return pointer to destination
   pop esi          ;U - restore esi
   pop edi          ;V - restore edi
   M_EXIT

   align @WordSize
TrailDown3:
   mov al,[esi+3]    ;U - get first byte from source
                           ;V - spare
   mov [edi+3],al    ;U - put first byte into destination
   mov al,[esi+2]    ;V - get second byte from source
   mov [edi+2],al    ;U - put second byte into destination
   mov al,[esi+1]    ;V - get third byte from source
   mov [edi+1],al    ;U - put third byte into destination
   mov eax,[esp + 0Ch] ;V - return pointer to destination
   pop esi          ;U - restore esi
   pop edi          ;V - restore edi
   M_EXIT

align    16
; VEC_memcpy - SSE2 forward block copy.
; Entered from CopyUpSSE2Check with esi = src, edi = dst (same offset within a
; 16-byte line) and ecx = byte count (>= 80h). The esi/edi saved at function
; entry are still on the stack; dst is pushed once more so that L_Return can
; pop it into eax as the return value.
VEC_memcpy:
   push    edi       ; save dst for returning
   mov       eax, esi
   and       eax, 0Fh
   ; eax = src and dst alignment (src mod 16)
   test    eax, eax
   jne       L_Notaligned

   ; in:
   ;edi = dst (16 byte aligned)
   ;esi = src (16 byte aligned)
   ;ecx = len is >= (128 - head alignment bytes)
   ; do block copy using SSE2 stores
L_Aligned:
   mov       edx, ecx
   and       ecx, 7Fh
   shr       edx, 7
   je       L_1a
   ; edx = loop count (number of 128-byte blocks)
   ; ecx = remaining copy length (< 128)
align    16
L_1:
   ; copy 128 bytes per iteration through xmm0-xmm7
   movdqa    xmm0,xmmword ptr [esi]
   movdqa    xmm1,xmmword ptr [esi+10h]
   movdqa    xmm2,xmmword ptr [esi+20h]
   movdqa    xmm3,xmmword ptr [esi+30h]
   movdqa    xmmword ptr [edi],xmm0
   movdqa    xmmword ptr [edi+10h],xmm1
   movdqa    xmmword ptr [edi+20h],xmm2
   movdqa    xmmword ptr [edi+30h],xmm3
   movdqa    xmm4,xmmword ptr [esi+40h]
   movdqa    xmm5,xmmword ptr [esi+50h]
   movdqa    xmm6,xmmword ptr [esi+60h]
   movdqa    xmm7,xmmword ptr [esi+70h]
   movdqa    xmmword ptr [edi+40h],xmm4
   movdqa    xmmword ptr [edi+50h],xmm5
   movdqa    xmmword ptr [edi+60h],xmm6
   movdqa    xmmword ptr [edi+70h],xmm7
   lea       esi,[esi+80h]
   lea       edi,[edi+80h]
   dec       edx
   jne       L_1
L_1a:
   test    ecx, ecx
   je       L_Return

   ; ecx = length (< 128 bytes)
   mov       edx, ecx
   shr       edx, 4
   test    edx, edx
   je       L_Trailing
   ; if >= 16 bytes do a loop (16 bytes at a time)
   ; edx - loop count
   ; edi = dst
   ; esi = src
align 16
L_2:
   movdqa    xmm0, xmmword ptr [esi]
   movdqa    xmmword ptr [edi], xmm0
   lea       esi,[esi+10h]
   lea       edi,[edi+10h]
   dec       edx
   jne       L_2

L_Trailing:

   ; last 1-15 bytes: copy whole dwords first, then the remaining 0-3 bytes
   ; esi = src
   ; edi = dst
   and       ecx, 0Fh
   ; ecx = remaining len
   je       L_Return

   ; get dword aligned
   mov eax, ecx; save remaining len and calc number of dwords
   shr ecx, 2
   je    L_TrailBytes ; if none try bytes
L_TrailDword:
   mov edx, dword ptr [esi]
   mov dword ptr [edi], edx
   lea esi,[esi+4]
   lea edi,[edi+4]
   dec ecx
   jne L_TrailDword
L_TrailBytes:
   mov ecx, eax
   and ecx, 03h
   je    L_Return ; if none return
L_TrailNextByte:
   mov al, byte ptr [esi]
   mov byte ptr [edi], al
   inc esi
   inc edi
   dec ecx
   jne L_TrailNextByte
align 16
L_Return:
   ; return dst
   pop eax    ; Get destination for return (pushed at VEC_memcpy)
   pop esi
   pop edi
   M_EXIT

; dst addr is not 16 byte aligned
align 16
L_Notaligned:

; copy the first 1-15 bytes to align both src and dst up to the nearest 16-byte boundary:
; the head is 16 - eax bytes long; its low two bits are copied as single bytes,
; the rest as dwords. The adjusted length is kept on the stack meanwhile.

; in
; esi = src
; edi = dst
; eax = src and dst alignment
; ecx = length

   mov edx, 010h
   sub edx, eax ; calc num bytes to get it aligned
   sub ecx, edx ; calc new length and save it
   push ecx
   mov eax, edx ; save alignment byte count for dwords
   mov ecx, eax ; set ecx to rep count
   and ecx, 03h
   je    L_MovDword ; if no bytes go do dwords
L_Byte:
   mov dl, byte ptr [esi] ; move the bytes
   mov byte ptr [edi], dl
   inc esi    ; inc the adrs
   inc edi
   dec ecx    ; dec the counter
   jne L_Byte
L_MovDword:
   shr eax, 2 ; get dword count
   je    L_Adjustcnt ; if none go to main loop
L_Dword:
   mov edx, dword ptr [esi] ; move the dwords
   mov dword ptr [edi], edx
   lea esi,[esi+4] ; inc the adrs
   lea edi,[edi+4]
   dec eax       ; dec the counter
   jne L_Dword
L_Adjustcnt:
   pop ecx    ; retrieve the adjusted length
   jmp L_Aligned

_MEM_ endp
   end

Ayala 发表于 2017-5-30 23:35:16

本帖最后由 Ayala 于 2017-5-30 23:57 编辑

   page ,132
   title memcpy - Copy source memory bytes to destination
;***
;memcpy.asm - contains memcpy and memmove routines
;
;    Copyright (c) Microsoft Corporation. All rights reserved.
;
;Purpose:
;    memcpy() copies a source memory buffer to a destination buffer.
;    Overlapping buffers are not treated specially, so propagation may occur.
;    memmove() copies a source memory buffer to a destination buffer.
;    Overlapping buffers are treated specially, to avoid propagation.
;
;*******************************************************************************

include ksamd64.inc
   subttl"memcpy"

;***
;memcpy - Copy source buffer to destination buffer
;
;Purpose:
;    memcpy() copies a source memory buffer to a destination memory buffer.
;    This routine does NOT recognize overlapping buffers, and thus can lead
;    to propagation.
;    For cases where propagation must be avoided, memmove() must be used.
;
;    Algorithm:
;
;    void * memcpy(void * dst, void * src, size_t count)
;    {
;             void * ret = dst;
;
;             /*
;             * copy from lower addresses to higher addresses
;             */
;             while (count--)
;                   *dst++ = *src++;
;
;             return(ret);
;    }
;
;memmove - Copy source buffer to destination buffer
;
;Purpose:
;    memmove() copies a source memory buffer to a destination memory buffer.
;    This routine recognizes overlapping buffers to avoid propagation.
;    For cases where propagation is not a problem, memcpy() can be used.
;
; Algorithm:
;
;    void * memmove(void * dst, void * src, size_t count)
;    {
;             void * ret = dst;
;
;             if (dst <= src || dst >= (src + count)) {
;                   /*
;                      * Non-Overlapping Buffers
;                      * copy from lower addresses to higher addresses
;                      */
;                   while (count--)
;                            *dst++ = *src++;
;                   }
;             else {
;                   /*
;                      * Overlapping Buffers
;                      * copy from higher addresses to lower addresses
;                      */
;                   dst += count - 1;
;                   src += count - 1;
;
;                   while (count--)
;                            *dst-- = *src--;
;                   }
;
;             return(ret);
;    }
;
;
;Entry:
;    void *dst = pointer to destination buffer
;    const void *src = pointer to source buffer
;    size_t count = number of bytes to copy
;
;Exit:
;    Returns a pointer to the destination buffer in RAX
;
;Uses:
;    RCX, RDX, R8-R11, XMM0, XMM1 (RSI/RDI are saved and restored around rep movsb)
;
;Exceptions:
;*******************************************************************************
   extrn __favor:dword
   extrn __ImageBase:byte
   extrn __memcpy_nt_iters:qword ; defined in cpu_disp.c

__FAVOR_ENFSTRG equ 1

   public memmove

   LEAF_ENTRY_ARG3 memcpy, _TEXT, dst:ptr byte, src:ptr byte, count:dword

   OPTION PROLOGUE:NONE, EPILOGUE:NONE

   memmove = memcpy

   ; Entry dispatch. On entry rcx = dst, rdx = src, r8 = count.
   ; Picks a copy strategy from the length and, for lengths above 32, from the
   ; relative position of the two buffers:
   ;   count <= 16        -> MoveBytes16 (jump table)
   ;   17 <= count <= 32  -> Move17to32  (two overlapping 16-byte moves)
   ;   src >= dst         -> CopyUp      (forward copy)
   ;   dst < src + count  -> CopyDown    (buffers overlap, copy back-to-front)
   ;   otherwise          -> falls through to CopyUp
   mov     r11, rcx                ; r11 = dst, kept for the return value
   mov     r10, rdx                ; r10 = src, kept for the remainder code
   cmp     r8, 16                  ; count <= 16 ?
   jbe     MoveBytes16             ; yes: go move them quick
   cmp     r8, 32                  ; count <= 32 ? (already known to be > 16)
   jbe     Move17to32              ; yes: handle lengths 17-32 as a special case
   sub     rdx, rcx                ; rdx = src - dst; CF is set when src < dst
   jae     CopyUp                  ; src >= dst: go move up
   mov     rax, r10                ; src < dst: check whether the buffers overlap
   add     rax, r8                 ; rax = src + count (one past the source end)
   cmp     rcx, rax                ; compare dst with src + count
   jb      CopyDown                ; dst < src + count: overlap, go move downward
                                   ; (addresses are unsigned, so use jb, not a
                                   ; signed jl; no overlap falls through to CopyUp)

; Forward-copy dispatch for lengths above 32.
; In: rcx = r11 = dst, r10 = src, rdx = src - dst, r8 = count.
CopyUp:
   cmp r8, 128                ; count <= 128 ?
   jbe XmmCopySmall           ; yes: skip the 128-byte block loops

   bt    __favor, __FAVOR_ENFSTRG ; test bit __FAVOR_ENFSTRG (1) of __favor: enhanced fast strings
   jnc XmmCopyUp             ; bit clear: Enhanced Fast Strings not available, use XMM

   ; Enhanced Fast Strings path: a single rep movsb.
   ; No destination alignment is done here. rdi and rsi are parked in r11 and r8
   ; and restored before returning, so rax must take the return value from r11
   ; before r11 is reused.
   mov rax, r11             ; rax = original destination pointer (return value)
   mov r11, rdi             ; save rdi in r11
   mov rdi, rcx             ; rdi = destination pointer
   mov rcx, r8             ; rcx = length (rep count)
   mov r8, rsi             ; save rsi in r8
   mov rsi, r10             ; rsi = source pointer
   rep movsb                ; copy rcx bytes from [rsi] to [rdi]
   mov rsi, r8             ; restore rsi
   mov rdi, r11             ; restore rdi
   ret

; Handle lengths 17-32 as a special case using XMM registers.
; This allows the regular code to assume that there will always be enough
; bytes for the "deferred" block of 16. Also any case that can be handled
; with just two stores is handled with just two stores, the regular code
; will always do 3 stores for unaligned moves that have a remainder.
; No assumptions are made here about buffer alignment or overlap.
; We load the entire string to be moved in 2 xmm registers before storing
; anything, so this works for any arrangement of overlapping buffers.
;
; dst is in rcx (can modify) and r11 (must preserve for return value)
; src is in r10 (should preserve for consistency)
; rdx still holds the src pointer here: the src - dst offset is only computed
; after the branch to this label has been taken or rejected
; r8 is the length, and is known to be 17 <= r8 <= 32
;
; When length < 32 the first 16 bytes include some of the last 16 bytes
; and we will store (32 - length) bytes twice. (E.g. in the worst case
; of len 17 we are storing the middle 15 bytes of the buffer twice).
; This is still much faster than doing logic and branching with 1, 2, 4
; and 8 byte conditional copies.
;
   align 16

; Copy 17..32 bytes with two (possibly overlapping) 16-byte moves.
; In:  rcx = dst, rdx = src (not yet converted to an offset on this path),
;      r8 = count with 17 <= r8 <= 32.
; Out: rax = dst.
; Both loads complete before either store, so overlapping buffers are handled.
; The memory operands were missing from the listing and are restored here from
; the register roles above.
Move17to32:
   movups  xmm0, [rdx]             ; load first 16 bytes of src
   movups  xmm1, (-16)[rdx + r8]   ; load last 16 bytes of src
   movups  [rcx], xmm0             ; store first 16 bytes of dst
   movups  (-16)[rcx + r8], xmm1   ; store last 16 bytes of dst
   mov     rax, rcx                ; set destination address as return value
   ret

;
; Move residual bytes.
;

   align 16

; Copy 0..16 bytes by jumping to a size-specific handler.
; In:  rcx = dst, rdx = src, r8 = count (0..16).
; Out: control transfers to MoveSmall<count> with rax = dst, rdx = src.
; The table stores 32-bit image-relative offsets, so the image base is added
; back to form the absolute target.
MoveBytes16:
      mov     rax, rcx                ; rax = destination address (return value)
      lea     r9, OFFSET __ImageBase  ; r9 = image base
      mov     ecx, [(IMAGEREL MoveSmall) + r9 + r8*4] ; ecx = table[count]
      add     rcx, r9                 ; rcx = absolute handler address
      jmp     rcx

; Jump table indexed by byte count; one image-relative entry per length 0..16.
MoveSmall dd  IMAGEREL MoveSmall0
      dd  IMAGEREL MoveSmall1
      dd  IMAGEREL MoveSmall2
      dd  IMAGEREL MoveSmall3
      dd  IMAGEREL MoveSmall4
      dd  IMAGEREL MoveSmall5
      dd  IMAGEREL MoveSmall6
      dd  IMAGEREL MoveSmall7
      dd  IMAGEREL MoveSmall8
      dd  IMAGEREL MoveSmall9
      dd  IMAGEREL MoveSmall10
      dd  IMAGEREL MoveSmall11
      dd  IMAGEREL MoveSmall12
      dd  IMAGEREL MoveSmall13
      dd  IMAGEREL MoveSmall14
      dd  IMAGEREL MoveSmall15
      dd  IMAGEREL MoveSmall16

   align 16

; Fixed-size handlers reached through the MoveSmall jump table.
; In:  rax = dst (loaded by MoveBytes16 before the table jump), rdx = src.
; Out: rax = dst (unchanged).
; Every handler performs all of its loads before its first store, so the
; result is correct for overlapping buffers as well.
; Scratch registers: rcx, r8, r9, r10, xmm0.
; The [rdx] / [rax] memory operands were missing from the listing and are
; restored here from the register roles above.
MoveSmall0::
   ret

MoveSmall2::
   movzx   ecx, word ptr [rdx]     ; get two bytes from source
   mov     [rax], cx               ; write two bytes to destination
   ret

MoveSmall8::
   mov     rcx, qword ptr [rdx]    ; get eight bytes from source
   mov     [rax], rcx              ; write eight bytes to destination
   ret

MoveSmall3::
   movzx   ecx, word ptr [rdx]     ; get two bytes from source
   movzx   r8d, byte ptr 2[rdx]    ; get last byte from source
   mov     [rax], cx               ; write two bytes to destination
   mov     2[rax], r8b             ; write last byte to destination
   ret

MoveSmall1::
   movzx   ecx, byte ptr [rdx]     ; get byte from source
   mov     [rax], cl               ; write byte to destination
   ret

MoveSmall16::
   movdqu  xmm0, xmmword ptr [rdx] ; get sixteen bytes from source
   movdqu  xmmword ptr [rax], xmm0 ; write sixteen bytes to destination
   ret

   align 16
MoveSmall11::
   mov     r8, qword ptr [rdx]     ; get eight bytes from source
   movzx   ecx, word ptr 8[rdx]    ; get two bytes from source
   movzx   r9d, byte ptr 10[rdx]   ; get last byte from source
   mov     [rax], r8               ; write eight bytes to destination
   mov     8[rax], cx              ; write two bytes to destination
   mov     10[rax], r9b            ; write last byte to destination
   mov     rcx, r11                ; rcx = original dst
                                   ; NOTE(review): looks redundant, rax already
                                   ; holds the return value - confirm
   ret

MoveSmall4::
   mov     ecx, dword ptr [rdx]    ; get four bytes from source
   mov     [rax], ecx              ; write four bytes to destination
   ret

   align 16
MoveSmall5::
   mov     ecx, dword ptr [rdx]    ; get four bytes from source
   movzx   r8d, byte ptr 4[rdx]    ; get last byte from source
   mov     [rax], ecx              ; write four bytes to destination
   mov     4[rax], r8b             ; write last byte to destination
   ret

   align 16
MoveSmall6::
   mov     ecx, dword ptr [rdx]    ; get four bytes from source
   movzx   r8d, word ptr 4[rdx]    ; get two bytes from source
   mov     [rax], ecx              ; write four bytes to destination
   mov     4[rax], r8w             ; write two bytes to destination
   ret

   align 16
MoveSmall7::
   mov     ecx, dword ptr [rdx]    ; get four bytes from source
   movzx   r8d, word ptr 4[rdx]    ; get two bytes from source
   movzx   r9d, byte ptr 6[rdx]    ; get last byte from source
   mov     [rax], ecx              ; write four bytes to destination
   mov     4[rax], r8w             ; write two bytes to destination
   mov     6[rax], r9b             ; write last byte to destination
   ret

MoveSmall13::
   mov     r8, qword ptr [rdx]     ; get eight bytes from source
   mov     ecx, dword ptr 8[rdx]   ; get four bytes from source
   movzx   r9d, byte ptr 12[rdx]   ; get last byte from source
   mov     [rax], r8               ; write eight bytes to destination
   mov     8[rax], ecx             ; write four bytes to destination
   mov     12[rax], r9b            ; write last byte to destination
   ret

   align 16
MoveSmall9::
   mov     r8, qword ptr [rdx]     ; get eight bytes from source
   movzx   ecx, byte ptr 8[rdx]    ; get last byte from source
   mov     [rax], r8               ; write eight bytes to destination
   mov     8[rax], cl              ; write last byte to destination
   ret

   align 16
MoveSmall10::
   mov     r8, qword ptr [rdx]     ; get eight bytes from source
   movzx   ecx, word ptr 8[rdx]    ; get two bytes from source
   mov     [rax], r8               ; write eight bytes to destination
   mov     8[rax], cx              ; write two bytes to destination
   ret

   align 16
MoveSmall12::
   mov     r8, qword ptr [rdx]     ; get eight bytes from source
   mov     ecx, dword ptr 8[rdx]   ; get four bytes from source
   mov     [rax], r8               ; write eight bytes to destination
   mov     8[rax], ecx             ; write four bytes to destination
   ret

   align 16
MoveSmall14::
   mov     r8, qword ptr [rdx]     ; get eight bytes from source
   mov     ecx, dword ptr 8[rdx]   ; get four bytes from source
   movzx   r9d, word ptr 12[rdx]   ; get two bytes from source
   mov     [rax], r8               ; write eight bytes to destination
   mov     8[rax], ecx             ; write four bytes to destination
   mov     12[rax], r9w            ; write two bytes to destination
   ret

   align 16
MoveSmall15::
   mov     r8, qword ptr [rdx]     ; get eight bytes from source
   mov     ecx, dword ptr 8[rdx]   ; get four bytes from source
   movzx   r9d, word ptr 12[rdx]   ; get two bytes from source
   movzx   r10d, byte ptr 14[rdx]  ; get last byte from source
   mov     [rax], r8               ; write eight bytes to destination
   mov     8[rax], ecx             ; write four bytes to destination
   mov     12[rax], r9w            ; write two bytes to destination
   mov     14[rax], r10b           ; write last byte to destination
   ret

;
; Memcpy up using SSE instructions.
;
; Preconditions:
;    destination in rcx (destructable) and r11 (must preserve for return value)
;    source in r10
;    length in r8, must be greater than 16
;    offset from dest to src in rdx
;    source addr > dest addr or else buffers don't overlap
;
; Aligned stores are much faster on AMD hardware, so start by moving however many
; bytes must be moved so updated dst is 16-byte aligned. We need to copy
; (16 - (dest mod 16)) bytes, but it's faster to just do an unaligned copy of 16
; bytes and then start the aligned loop as usual at ((dest - (dest mod 16)) + 16).
; This results in (dest mod 16) bytes being copied twice. This is a lot faster
; than a bunch of code to copy maybe 1 then maybe 2 then maybe 4 then maybe 8
; bytes to achieve dst alignment.
;
; We know the src address is greater than the dst, but not by how much. In the
; case where the difference is less than 16 we must be careful about the bytes
; that will be stored twice. We must do both loads before either store, or the
; second load of those bytes will get the wrong values. We handle this by
; loading the last 16 bytes that can be stored at an aligned address, but
; deferring the store of those bytes to the remainder code, so it can load the
; remainder before storing the deferred bytes. Since either or both of the two
; loops can be skipped, the preconditions needed by the remainder code must
; also apply to the loops. These conditions are:
; - r8 is the count remaining, not including the deferred bytes
; - [rcx + rdx] and [rcx] as usual point to the src and dst where the
;   number of bytes given by r8 should be copied from and to.
; - xmm0 holds the 16 deferred bytes that need to be stored at (-16)[rcx]
;
   align 16
; Forward copy with SSE loads/stores and a "deferred store" of one block.
; In:  rcx = r11 = dst, rdx = src - dst (so [rcx + rdx] addresses the source
;      byte matching [rcx]), r8 = count (> 32 on every path that reaches here).
; Out: rax = original dst (from r11).
; Invariant at XmmCopyLargeTest / XmmCopySmallTest / XmmCopyTrail:
;      xmm0 holds 16 bytes that still have to be stored at (-16)[rcx].
; The memory operands were missing from the listing and are restored here from
; the register roles documented in the surrounding comments.
XmmCopyUp:
   movups  xmm0, [rcx + rdx]       ; load deferred bytes (first 16 bytes of src)
   add     r8, rcx                 ; r8 points 1 byte past end of dst
   add     rcx, 16                 ; update to next block
   test    r11b, 15                ; test if destination aligned
   jz      XmmCopyLargeTest        ; aligned: go try 128-byte blocks
;
; Move alignment bytes.
;
XmmCopyAlign:
   movaps  xmm1, xmm0              ; save initial bytes in xmm1
   and     rcx, -16                ; rcx = first 16-byte aligned address above dst
   movups  xmm0, [rcx + rdx]       ; load aligned deferred-store bytes
   add     rcx, 16                 ; update to next block
   movups  [r11], xmm1             ; now safe to store 16 unaligned at start
;
; See if we can move any 128-byte blocks.
;
XmmCopyLargeTest:
   sub     r8, rcx                 ; r8 restored to count remaining
   mov     r9, r8                  ; copy count of bytes remaining
   shr     r9, 7                   ; compute number of 128-byte blocks
   jz      XmmCopySmallTest        ; if z jump around to 2nd loop
   movaps  (-16)[rcx], xmm0        ; going into 1st loop, ok to store deferred bytes
   cmp     r9, __memcpy_nt_iters   ; threshold defined by cpu_disp.c
   jna     short XmmCopyLargeInner ; at or below threshold: jump into 1st loop
   jmp     XmmCopyLargeInnerNT     ; long enough so non-temporal worth it, jump into nt loop

;
; Move 128-byte blocks
;
   align 16
;
; When possible, non-mov instructions are put between a load and store
; so their execution can overlap the store.
; The jnz is likewise moved earlier to come before the last store pair.
; Pairs of loads/stores are used to overlap cache latencies.
; movups and movaps are equally fast on aligned storage, we use movaps
; to document movs that we *know* are going to be aligned, movups otherwise.
; xmm0 must be preloaded before jumping into this loop, and the last
; store must be deferred (and the bytes to store left in xmm0) for the
; following loop and/or the remainder code.
;
XmmCopyLargeOuter:
   movaps  (-32)[rcx], xmm0        ; store 7th chunk from prior iteration
   movaps  (-16)[rcx], xmm1        ; store 8th chunk from prior iteration
XmmCopyLargeInner:                 ; enter loop here with xmm0 preloaded.
   movups  xmm0, [rcx + rdx]       ; load first 16 byte chunk
   movups  xmm1, 16[rcx + rdx]     ; load 2nd 16 byte chunk
   add     rcx, 128                ; advance destination address
   movaps  (-128)[rcx], xmm0       ; store first 16 byte chunk
   movaps  (-112)[rcx], xmm1       ; store 2nd 16 byte chunk
   movups  xmm0, (-96)[rcx + rdx]  ; load 3rd chunk
   movups  xmm1, (-80)[rcx + rdx]  ; load 4th chunk
   dec     r9                      ; dec block counter (set cc for jnz)
   movaps  (-96)[rcx], xmm0        ; store 3rd chunk
   movaps  (-80)[rcx], xmm1        ; store 4th chunk
   movups  xmm0, (-64)[rcx + rdx]  ; load 5th chunk
   movups  xmm1, (-48)[rcx + rdx]  ; load 6th chunk
   movaps  (-64)[rcx], xmm0        ; store 5th chunk
   movaps  (-48)[rcx], xmm1        ; store 6th chunk
   movups  xmm0, (-32)[rcx + rdx]  ; load 7th chunk
   movups  xmm1, (-16)[rcx + rdx]  ; load 8th chunk
   jnz     XmmCopyLargeOuter       ; loop if more blocks (flags still from dec r9)

XmmCopyFinish:                     ; non-temporal codepath rejoins here
   movaps  (-32)[rcx], xmm0        ; store 7th chunk from final iteration
   and     r8, 127                 ; compute remaining byte count
   movaps  xmm0, xmm1              ; 8th chunk becomes deferred bytes
   jmp     XmmCopySmallTest

XmmCopySmall:
   movups  xmm0, [rcx + rdx]       ; load deferred bytes
   add     rcx, 16
   sub     r8, 16
;
; See if we have any 16-byte blocks left to move
;
XmmCopySmallTest:
   mov     r9, r8                  ; copy count of bytes remaining
   shr     r9, 4                   ; compute number of 16-byte blocks
   jz      short XmmCopyTrail      ; on z, no 16-byte blocks, skip 2nd loop

   align 16

XmmCopySmallLoop:
   movups  (-16)[rcx], xmm0        ; the first time through this is the
                                   ; store of the deferred bytes from above
   movups  xmm0, [rcx + rdx]       ; load a block
   add     rcx, 16                 ; advance dest addr (store is deferred)
   dec     r9
   jnz     XmmCopySmallLoop

XmmCopyTrail:
   and     r8, 15                  ; compute remaining byte count
   jz      short XmmCopyReturn     ; if z, no remainder bytes to move
;
; Handle remainder bytes.
;
; As at the start, we are going to do an unaligned copy of 16 bytes which will double-write
; some bytes. We must not touch rcx or xmm0 because they have what we need to store the
; deferred block. We use rax to point to the first byte after the end of the buffer and
; back up from there. Note rax is pointing to an address we must not read or write!
;
   lea     rax, [rcx + r8]         ; make rax point one past the end
   movups  xmm1, (-16)[rax + rdx]  ; load last 16 bytes of source buffer
   movups  (-16)[rax], xmm1        ; write last 16 bytes, including 16-r8 bytes
                                   ; from the last aligned block which we are about to
                                   ; overstore with identical values
XmmCopyReturn:
   movups  (-16)[rcx], xmm0        ; store the last deferred aligned block
   mov     rax, r11                ; we must return the original destination address
   ret

;
; Move 128-byte blocks non-temporal
;
   align 16
;
; non-temporal is exactly the same as the regular xmm loop above, except the movaps
; stores are movntps and we use prefetchnta. We are prefetching in two places, each
; prefetch gets 64 bytes about half an iteration ahead of time (about 10 instructions
; lead time). When we come to the end of the memcpy, we'll be prefetching bytes
; beyond the buffer we need to copy from, which may not be valid bytes. This is
; not illegal; if the memory address is invalid it does not trap, the hardware treats
; illegal prefetches as nops.
;

; Non-temporal variant of the 128-byte block loop: same load/store schedule,
; but stores use movntps and the source is prefetched with prefetchnta.
; In:  rcx = next dst block (16-byte aligned), rdx = src - dst,
;      r9 = number of 128-byte blocks (> 0).
; Out: xmm0 / xmm1 hold the 7th / 8th chunk of the last block, not yet stored;
;      control rejoins the regular path at XmmCopyFinish after an sfence.
; The memory operands were missing from the listing and are restored here.
; NOTE(review): the two prefetchnta operands could not be recovered from the
; listing; the +512 / +576 distances below are a reconstruction - confirm them
; against the original source. prefetchnta is only a hint, so the distances
; affect performance, not the bytes copied.
XmmCopyLargeOuterNT:
   movntps (-32)[rcx], xmm0        ; store 7th chunk from prior iteration
   movntps (-16)[rcx], xmm1        ; store 8th chunk from prior iteration
XmmCopyLargeInnerNT:               ; enter loop here with xmm0 preloaded.
   prefetchnta [rcx + rdx + 512]   ; prefetch several cache lines ahead
   movups  xmm0, [rcx + rdx]       ; load first 16 byte chunk
   movups  xmm1, 16[rcx + rdx]     ; load 2nd 16 byte chunk
   add     rcx, 128                ; advance destination address
   movntps (-128)[rcx], xmm0       ; store first 16 byte chunk
   movntps (-112)[rcx], xmm1       ; store 2nd 16 byte chunk
   movups  xmm0, (-96)[rcx + rdx]  ; load 3rd chunk
   movups  xmm1, (-80)[rcx + rdx]  ; load 4th chunk
   dec     r9                      ; dec block counter (set cc for jnz)
   movntps (-96)[rcx], xmm0        ; store 3rd chunk
   movntps (-80)[rcx], xmm1        ; store 4th chunk
   movups  xmm0, (-64)[rcx + rdx]  ; load 5th chunk
   movups  xmm1, (-48)[rcx + rdx]  ; load 6th chunk
   prefetchnta [rcx + rdx + 576]   ; prefetch several cache lines ahead
   movntps (-64)[rcx], xmm0        ; store 5th chunk
   movntps (-48)[rcx], xmm1        ; store 6th chunk
   movups  xmm0, (-32)[rcx + rdx]  ; load 7th chunk
   movups  xmm1, (-16)[rcx + rdx]  ; load 8th chunk
   jnz     XmmCopyLargeOuterNT     ; loop if more blocks (flags still from dec r9)

   sfence                          ; order the non-temporal stores before rejoining
   jmp     XmmCopyFinish           ; rejoin regular memcpy codepath

;
; The source address is less than the destination address.
;

   align 16
;
; Move bytes down using SSE registers. The source address is less than
; the destination address and the buffers overlap. We will do everything back-to-front.
;
; Preconditions:
;    destination is r11 (must preserve for return value) and rcx
;    source in r10 (must preserve for remainder move)
;    length in r8, must have been verified to be greater than 16
;    offset from dest to src in rdx
;    source addr < dest addr and the buffers overlap
;
; Backward (high-to-low) copy for overlapping buffers with src < dst.
; In:  rcx = r11 = dst, r10 = src, rdx = src - dst, r8 = count (> 32).
; Out: rax = original dst (from r11).
; Invariant at XmmMovLargeTest / XmmMovSmallTest / XmmMovTrailing:
;      xmm0 holds 16 bytes that still have to be stored at [rcx], and
;      r8 = number of bytes below rcx that remain to be copied.
; The memory operands were missing from the listing and are restored here from
; the register roles documented in the comments.
CopyDown:
   add     rcx, r8                 ; make rcx point one past the end of the dst buffer
   movups  xmm0, (-16)[rcx + rdx]  ; load deferred bytes (last 16 bytes of src)
   sub     rcx, 16                 ; reduce dst addr
   sub     r8, 16                  ; r8 -= 16 in case aligned

;
; Aligned stores using movaps or movups are faster on AMD hardware than unaligned
; stores using movups. To achieve 16-byte dest alignment, we do an unaligned move
; of the last 16 bytes of the buffers, then reduce rcx only by the amount necessary
; to achieve alignment. This results in some bytes getting copied twice, unless we're
; already aligned.
;
; We know the src address is less than the dst, but not by exactly how much. In the
; case where the difference is less than 16 we must be careful about the bytes
; that will be stored twice. We must do both loads before either store, or the
; second load of those bytes will get the wrong values. We handle this by
; deferring the store of 16 aligned bytes to the remainder code, so it can load the
; remainder before storing the deferred bytes. Since either or both of the two
; loops can be skipped, the preconditions needed by the remainder code must
; also apply to the loops. These conditions are:
; - r8 is the count remaining, not including the deferred bytes
; - rcx points one past the end of the remainder bytes
; - rdx is the offset from the dst to the source
; - xmm0 holds the 16 deferred bytes that need to be stored at [rcx]
;
   test    cl, 15                  ; test if dest aligned
   jz      XmmMovLargeTest         ; aligned: go try 128-byte blocks
;
; Move alignment bytes.
;
XmmMovAlign:
   mov     rax, rcx                ; save unaligned store address
   and     rcx, -16                ; rcx is deferred store address
   movups  xmm1, xmm0              ; copy unaligned last bytes to xmm1
   movups  xmm0, [rcx + rdx]       ; load deferred-store bytes
   movups  [rax], xmm1             ; now safe to do unaligned store
   mov     r8, rcx                 ; easier to recalc r8 using rcx-r11 ...
   sub     r8, r11                 ; ... than calc how much to subtract from r8

;
; See if we can move any 128-byte blocks.
;
XmmMovLargeTest:
   mov     r9, r8                  ; copy count of bytes remaining
   shr     r9, 7                   ; compute number of 128-byte blocks
   jz      short XmmMovSmallTest   ; if z jump around to 2nd loop
   movaps  [rcx], xmm0             ; going into 1st loop, ok to store deferred bytes
   jmp     short XmmMovLargeInner  ; jump into 1st loop
;
; Move 128-byte blocks
;
   align 16

XmmMovLargeOuter:
   movaps  (128-112)[rcx], xmm0    ; store 7th chunk from prior iteration
   movaps  (128-128)[rcx], xmm1    ; store 8th chunk from prior iteration
XmmMovLargeInner:
   movups  xmm0, (-16)[rcx + rdx]  ; load first 16 byte chunk
   movups  xmm1, (-32)[rcx + rdx]  ; load 2nd 16 byte chunk
   sub     rcx, 128                ; reduce destination address
   movaps  (128-16)[rcx], xmm0     ; store first 16 byte chunk
   movaps  (128-32)[rcx], xmm1     ; store 2nd 16 byte chunk
   movups  xmm0, (128-48)[rcx + rdx] ; load 3rd chunk
   movups  xmm1, (128-64)[rcx + rdx] ; load 4th chunk
   dec     r9                      ; dec block counter (set cc for jnz)
   movaps  (128-48)[rcx], xmm0     ; store 3rd chunk
   movaps  (128-64)[rcx], xmm1     ; store 4th chunk
   movups  xmm0, (128-80)[rcx + rdx] ; load 5th chunk
   movups  xmm1, (128-96)[rcx + rdx] ; load 6th chunk
   movaps  (128-80)[rcx], xmm0     ; store 5th chunk
   movaps  (128-96)[rcx], xmm1     ; store 6th chunk
   movups  xmm0, (128-112)[rcx + rdx] ; load 7th chunk
   movups  xmm1, (128-128)[rcx + rdx] ; load 8th chunk
   jnz     short XmmMovLargeOuter  ; loop if more blocks (flags still from dec r9)

   movaps  (128-112)[rcx], xmm0    ; store 7th chunk from final iteration
   and     r8, 127                 ; compute remaining byte count
   movaps  xmm0, xmm1              ; 8th chunk becomes deferred bytes
;
; See if we have any 16-byte blocks left to move
;
XmmMovSmallTest:
   mov     r9, r8                  ; copy count of bytes remaining
   shr     r9, 4                   ; compute number of 16-byte blocks
   jz      short XmmMovTrailing    ; if z, no 16-byte blocks

   align 16

XmmMovSmallLoop:
   movups  [rcx], xmm0             ; the first time through this is the
                                   ; store of the deferred bytes from above
   sub     rcx, 16                 ; reduce dest addr
   movups  xmm0, [rcx + rdx]       ; load a block
   dec     r9
   jnz     XmmMovSmallLoop

XmmMovTrailing:
   and     r8, 15                  ; compute remaining byte count
   jz      short XmmMovReturn      ; if z, no residual bytes to move

;
; Handle remainder bytes.
;
; As at the start, we are going to do an unaligned copy of 16 bytes which will double-write
; some bytes. We must not touch rcx or xmm0 because they have what we need to store the
; deferred block. But unlike the forward XMM copy code above, we have r10 and r11 we can
; just use to copy the lowest 16 bytes.
;
   movups  xmm1, [r10]             ; load lowest 16 bytes, which includes remainder
   movups  [r11], xmm1             ; store lowest 16 bytes, which includes remainder

XmmMovReturn:
   movups  [rcx], xmm0             ; store deferred bytes
   mov     rax, r11                ; we must return destination address
   ret

   LEAF_END memcpy, _TEXT

   end

Ayala 发表于 2017-5-30 23:36:28

/***
*memcpy.c - contains memcpy routine
*
*    Copyright (c) Microsoft Corporation. All rights reserved.
*
*Purpose:
*    memcpy() copies a source memory buffer to a destination buffer.
*    Overlapping buffers are not treated specially, so propagation may occur.
*
*******************************************************************************/

#include <cruntime.h>
#include <string.h>

#pragma function(memcpy)

/***
*memcpy - Copy source buffer to destination buffer
*
*Purpose:
*    memcpy() copies a source memory buffer to a destination memory buffer.
*    This routine does NOT recognize overlapping buffers, and thus can lead
*    to propagation.
*
*    For cases where propagation must be avoided, memmove() must be used.
*
*Entry:
*    void *dst = pointer to destination buffer
*    const void *src = pointer to source buffer
*    size_t count = number of bytes to copy
*
*Exit:
*    Returns a pointer to the destination buffer
*
*Exceptions:
*******************************************************************************/

void * __cdecl memcpy (
   void * dst,
   const void * src,
   size_t count
   )
{
   /* Byte cursors over the two buffers; dst itself is kept for the return. */
   char * out = (char *)dst;
   const char * in = (const char *)src;
   size_t remaining;

   /*
      * Copy one byte at a time from lower addresses to higher addresses.
      * No overlap handling is done here.
      */
   for (remaining = count; remaining != 0; --remaining)
            *out++ = *in++;

   /* The original destination pointer is the return value. */
   return dst;
}

0xAA55 发表于 2017-6-13 03:00:07

其实在远古时代，DOS的年代，有人玩内存到内存DMA呢，不过据说都不是很成功。

唐凌发表于 2018-7-27 10:05:42

不判断内存重叠的话还不如用这货：
https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/hhta9830(v%3dvs.100)

0xAA55 发表于 2018-7-27 22:35:03

tangptr@126.com 发表于 2018-7-27 10:05
不判断内存重叠的话还不如用这货：
https://docs.microsoft.com/en-us/previous-versions/visualstudio/vis ...

然而非atom架构的机器上这货烂得一匹

唐凌发表于 2018-7-28 00:12:09

0xAA55 发表于 2018-7-27 22:35
然而非atom架构的机器上这货烂得一匹

mov_char:
   ... ...
   mov edi,
   mov esi,
   mov ecx,
   rep movsb
   ... ...
   ret
总比
move_char:
push ebp
mov ebp, esp
sub esp, 0x0c
mov eax,
mov edi, eax
mov esi,
mov ecx, dword ptr

move_loop:
mov bl, byte ptr
mov byte ptr , bl
inc esi
inc edi
dec ecx
jnz move_loop
　　　　
mov esp, ebp
pop ebp
ret
要好。

3239066163 发表于 2018-8-6 09:21:22

emm这一堆代码真的长以后潜伏在这个论坛了

watermelon 发表于 2018-9-1 10:50:09

小弟目前还在学一些简单的汇编，有些指令看得懂，有些指令看不懂（大部分），但是无疑发现memcpy有700多行的asm程序是多么兴奋的事情。

页: [1]

技术宅的结界's Archiver

【C】memcpy真的就只是复制内存吗？然而并没有那么简单。