1 /** 
2  Identify the characteristics of the host CPU, providing information
3  about cache sizes and assembly optimisation hints.
4 
5  Some of this information was extremely difficult to track down. Some of the
6  documents below were found only in cached versions stored by search engines!
7   This code relies on information found in:
8 
9 $(UL
10   $(LI "Intel(R) 64 and IA-32 Architectures Software Developers Manual,
11       Volume 2A: Instruction Set Reference, A-M" (2007).)
12   $(LI "AMD CPUID Specification", Advanced Micro Devices, Rev 2.28 (2008).)
13   $(LI "AMD Processor Recognition Application Note For Processors Prior to AMD
14       Family 0Fh Processors", Advanced Micro Devices, Rev 3.13 (2005).)
15   $(LI "AMD Geode(TM) GX Processors Data Book",
16       Advanced Micro Devices, Publication ID 31505E, (2005).)
17   $(LI "AMD K6 Processor Code Optimisation", Advanced Micro Devices, Rev D (2000).)
18   $(LI "Application note 106: Software Customization for the 6x86 Family",
19       Cyrix Corporation, Rev 1.5 (1998))
20   $(LI $(LINK http://ftp.intron.ac/pub/document/cpu/cpuid.htm))
21   $(LI "Geode(TM) GX1 Processor Series Low Power Integrated X86 Solution",
22       National Semiconductor, (2002))
23   $(LI "The VIA Isaiah Architecture", G. Glenn Henry, Centaur Technology, Inc (2008).)
24   $(LI $(LINK http://www.sandpile.org/ia32/cpuid.htm))
25   $(LI $(LINK http://grafi.ii.pw.edu.pl/gbm/x86/cpuid.html))
  $(LI "What every programmer should know about memory",
     Ulrich Drepper, Red Hat, Inc., (2007).)
28   $(LI "CPU Identification by the Windows Kernel", G. Chappell (2009).
29     $(LINK http://www.geoffchappell.com/viewer.htm?doc=studies/windows/km/cpu/cx8.htm))
30   $(LI "Intel(R) Processor Identification and the CPUID Instruction, Application
31      Note 485" (2009).)
32 )
33 
34 AUTHORS:  Don Clugston,
35           Tomas Lindquist Olsen $(EMAIL tomas@famolsen.dk)
36 COPYRIGHT:  Public Domain
37 
38 BUGS:   Currently only works on x86 CPUs.
39         Many processors have bugs in their microcode for the CPUID instruction,
40         so sometimes the cache information may be incorrect.
41 */
42 
43 module tango.core.tools.Cpuid;
44 
45 // If optimizing for a particular processor, it is generally better
46 // to identify based on features rather than model. NOTE: Normally
47 // it's only worthwhile to optimise for the latest Intel and AMD CPU,
48 // with a backup for other CPUs.
49 // Pentium    -- preferPentium1()
50 // PMMX       --   + mmx()
51 // PPro       -- default
52 // PII        --   + mmx()
53 // PIII       --   + mmx() + sse()
54 // PentiumM   --   + mmx() + sse() + sse2()
55 // Pentium4   -- preferPentium4()
56 // PentiumD   --   + isX86_64()
57 // Core2      -- default + isX86_64()
58 // AMD K5     -- preferPentium1()
59 // AMD K6     --   + mmx()
// AMD K6-II  --   + mmx() + amd3dnow()
61 // AMD K7     -- preferAthlon()
62 // AMD K8     --   + sse2()
63 // AMD K10    --   + isX86_64()
64 // Cyrix 6x86 -- preferPentium1()
65 //    6x86MX  --   + mmx()
66 
67 public:
68 
/// Cache size and behaviour.
/// One CacheInfo describes a single level of the data-cache hierarchy;
/// the module-level datacache array holds one entry per level.
struct CacheInfo
{
    /// Size of the cache, in kilobytes, per CPU.
    /// For L1 unified (data + code) caches, this size is half the physical size.
    /// (we don't halve it for larger sizes, since normally
    /// data size is much greater than code size for critical loops).
    uint size;
    /// Number of ways of associativity, eg:$(BR)
    /// 1 = direct mapped$(BR)
    /// 2 = 2-way set associative$(BR)
    /// 3 = 3-way set associative$(BR)
    /// ubyte.max = fully associative
    ubyte associativity;
    /// Number of bytes read into the cache when a cache miss occurs.
    uint lineSize;
}
86 
87 public:
88     /// Returns vendor string, for display purposes only.
89     /// Do NOT use this to determine features!
90     /// Note that some CPUs have programmable vendorIDs.
91     const(char)[] vendor()     {return cast(const(char)[]) vendorID;}
92     /// Returns processor string, for display purposes only
93     const(char)[] processor()  {return processorName;}    
94     
    /// The data caches, indexed by level (datacache[0] is L1).
    /// If there are fewer than 5 physical cache levels, the module constructor
    /// fills the remaining levels with a size of uint.max/1024 kilobytes
    /// (== entire 32-bit memory space), associativity 1, and the line size
    /// of the previous level.
    CacheInfo[5] datacache;
98     /// Does it have an x87 FPU on-chip?
99     @property bool x87onChip()    {return (features&FPU_BIT)!=0;}
100     /// Is MMX supported?
101     @property bool mmx()          {return (features&MMX_BIT)!=0;}
102     /// Is SSE supported?
103     @property bool sse()          {return (features&SSE_BIT)!=0;}
104     /// Is SSE2 supported?
105     @property bool sse2()         {return (features&SSE2_BIT)!=0;}
106     /// Is SSE3 supported?
107     @property bool sse3()         {return (miscfeatures&SSE3_BIT)!=0;}
108     /// Is SSSE3 supported?
109     @property bool ssse3()        {return (miscfeatures&SSSE3_BIT)!=0;}
110     /// Is SSE4.1 supported?
111     @property bool sse41()        {return (miscfeatures&SSE41_BIT)!=0;}
112     /// Is SSE4.2 supported?
113     @property bool sse42()        {return (miscfeatures&SSE42_BIT)!=0;}
114     /// Is SSE4a supported?
115     @property bool sse4a()        {return (amdmiscfeatures&SSE4A_BIT)!=0;}
116     /// Is AMD 3DNOW supported?
117     @property bool amd3dnow()     {return (amdfeatures&AMD_3DNOW_BIT)!=0;}
118     /// Is AMD 3DNOW Ext supported?
119     @property bool amd3dnowExt()  {return (amdfeatures&AMD_3DNOW_EXT_BIT)!=0;}
120     /// Are AMD extensions to MMX supported?
121     @property bool amdMmx()       {return (amdfeatures&AMD_MMX_BIT)!=0;}
122     /// Is fxsave/fxrstor supported?
123     @property bool hasFxsr()          {return (features&FXSR_BIT)!=0;}
124     /// Is cmov supported?
125     @property bool hasCmov()          {return (features&CMOV_BIT)!=0;}
126     /// Is rdtsc supported?
127     @property bool hasRdtsc()         {return (features&TIMESTAMP_BIT)!=0;}
128     /// Is cmpxchg8b supported?
129     @property bool hasCmpxchg8b()     {return (features&CMPXCHG8B_BIT)!=0;}
130     /// Is cmpxchg8b supported?
131     @property bool hasCmpxchg16b()    {return (miscfeatures&CMPXCHG16B_BIT)!=0;}
132     /// Is SYSENTER/SYSEXIT supported?
133     @property bool hasSysEnterSysExit()     {
134         // The SYSENTER/SYSEXIT features were buggy on Pentium Pro and early PentiumII.
135         // (REF: www.geoffchappell.com).
136         if (probablyIntel && (family < 6 || (family==6 && (model< 3 || (model==3 && stepping<3)))))
137             return false;
138         return (features & SYSENTERSYSEXIT_BIT)!=0;
139     }
140     
141     /// Is 3DNow prefetch supported?
142     @property bool has3dnowPrefetch()
143         {return (amdmiscfeatures&AMD_3DNOW_PREFETCH_BIT)!=0;}
144     /// Are LAHF and SAHF supported in 64-bit mode?
145     @property bool hasLahfSahf()          {return (amdmiscfeatures&LAHFSAHF_BIT)!=0;}
146     /// Is POPCNT supported?
147     @property bool hasPopcnt()        {return (miscfeatures&POPCNT_BIT)!=0;}    
148     /// Is LZCNT supported?
149     @property bool hasLzcnt()         {return (amdmiscfeatures&LZCNT_BIT)!=0;}
150     /// Is this an Intel64 or AMD 64?
151     @property bool isX86_64()         {return (amdfeatures&AMD64_BIT)!=0;}
152             
153     /// Is this an IA64 (Itanium) processor?
154     @property bool isItanium()        { return (features&IA64_BIT)!=0; }
155 
156     /// Is hyperthreading supported?
157     @property bool hyperThreading()   { return maxThreads>maxCores; }
158     /// Returns number of threads per CPU
159     @property uint threadsPerCPU()    {return maxThreads;}
160     /// Returns number of cores in CPU
161     @property uint coresPerCPU()      {return maxCores;}
162     
163     /// Optimisation hints for assembly code.
164     /// For forward compatibility, the CPU is compared against different
165     /// microarchitectures. For 32-bit X86, comparisons are made against
166     /// the Intel PPro/PII/PIII/PM family.
167     ///
168     /// The major 32-bit x86 microarchitecture 'dynasties' have been:
169     /// (1) Intel P6 (PentiumPro, PII, PIII, PM, Core, Core2).
170     /// (2) AMD Athlon (K7, K8, K10).
171     /// (3) Intel NetBurst (Pentium 4, Pentium D).
172     /// (4) In-order Pentium (Pentium1, PMMX, Atom)
173     /// Other early CPUs (Nx586, AMD K5, K6, Centaur C3, Transmeta,
174     ///   Cyrix, Rise) were mostly in-order.
175     /// Some new processors do not fit into the existing categories:
176     /// Intel Atom 230/330 (family 6, model 0x1C) is an in-order core.
    /// Centaur Isaiah = VIA Nano (family 6, model F) is an out-of-order core.
178     ///
179     /// Within each dynasty, the optimisation techniques are largely
180     /// identical (eg, use instruction pairing for group 4). Major
181     /// instruction set improvements occur within each dynasty.
182     
183     /// Does this CPU perform better on AMD K7 code than PentiumPro..Core2 code?
184     @property bool preferAthlon() { return probablyAMD && family >=6; }
185     /// Does this CPU perform better on Pentium4 code than PentiumPro..Core2 code?
186     @property bool preferPentium4() { return probablyIntel && family == 0xF; }
187     /// Does this CPU perform better on Pentium I code than Pentium Pro code?
188     @property bool preferPentium1() { return family < 6 || (family==6 && model < 0xF && !probablyIntel); }
189 
public:
    /// Processor type (vendor-dependent).
    /// This should be visible ONLY for display purposes.
    uint stepping, model, family;
    /// One more than the index of the deepest datacache[] entry that had a
    /// non-zero size before the module constructor padded the unused levels.
    uint numCacheLevels = 1;
private:
    bool probablyIntel; // true = _probably_ an Intel processor, might be faking
    bool probablyAMD; // true = _probably_ an AMD processor
    // Vendor string exactly as stored by cpuidX86() from CPUID function 0.
    char[12] vendorID;
    // Trimmed copy of processorNameBuffer, or "Unknown CPU".
    string processorName;
    // Raw output of CPUID functions 0x8000_0002 .. 0x8000_0004.
    char[48] processorNameBuffer;
    uint features = 0;     // mmx, sse, sse2, hyperthreading, etc
    uint miscfeatures = 0; // sse3, etc.
    uint amdfeatures = 0;  // 3DNow!, mmxext, etc
    uint amdmiscfeatures = 0; // sse4a, sse5, svm, etc
    // Backing values for coresPerCPU() and threadsPerCPU().
    uint maxCores = 1;
    uint maxThreads = 1;
207     // Note that this may indicate multi-core rather than hyperthreading.
208     bool hyperThreadingBit()    { return (features&HTT_BIT)!=0;}
209     
    // feature flags CPUID1_EDX
    // Masks applied to `features` (EDX as stored by cpuidX86 after CPUID function 1).
    enum : uint
    {
        FPU_BIT = 1,          // tested by x87onChip
        TIMESTAMP_BIT = 1<<4, // rdtsc
        MDSR_BIT = 1<<5,      // RDMSR/WRMSR
        CMPXCHG8B_BIT = 1<<8,
        SYSENTERSYSEXIT_BIT = 1<<11,
        CMOV_BIT = 1<<15,
        MMX_BIT = 1<<23,
        FXSR_BIT = 1<<24,     // fxsave/fxrstor
        SSE_BIT = 1<<25,
        SSE2_BIT = 1<<26,
        HTT_BIT = 1<<28,      // tested by hyperThreadingBit
        IA64_BIT = 1<<30      // tested by isItanium
    }
    // feature flags misc CPUID1_ECX
    // Masks applied to `miscfeatures` (ECX as stored by cpuidX86 after CPUID function 1).
    enum : uint
    {
        SSE3_BIT = 1,
        PCLMULQDQ_BIT = 1<<1, // from AVX
        MWAIT_BIT = 1<<3,
        SSSE3_BIT = 1<<9,
        FMA_BIT = 1<<12,     // from AVX
        CMPXCHG16B_BIT = 1<<13,
        SSE41_BIT = 1<<19,
        SSE42_BIT = 1<<20,
        POPCNT_BIT = 1<<23,
        AES_BIT = 1<<25, // AES instructions from AVX
        OSXSAVE_BIT = 1<<27, // Used for AVX
        AVX_BIT = 1<<28
    }
242 /+    
243 version(X86_64) {    
244     bool hasAVXinHardware() {
245         // This only indicates hardware support, not OS support.
246         return (miscfeatures&AVX_BIT) && (miscfeatures&OSXSAVE_BIT);
247     }
248     // Is AVX supported (in both hardware & OS)?
249     bool Avx() {
250         if (!hasAVXinHardware()) return false;
251         // Check for OS support
252         uint xfeatures;
253         asm {mov ECX, 0; xgetbv; mov xfeatures, EAX; }
254         return (xfeatures&0x6)==6;
255     }
256     bool hasAvxFma() {
257         if (!AVX()) return false;
258         return (features&FMA_BIT)!=0;        
259     }
260 }
261 +/    
    // AMD feature flags CPUID80000001_EDX
    // Masks applied to `amdfeatures` (EDX as stored by cpuidX86 after
    // CPUID function 0x8000_0001).
    enum : uint
    {
        AMD_MMX_BIT = 1<<22,
//      FXR_OR_CYRIXMMX_BIT = 1<<24, // Cyrix/NS: 6x86MMX instructions. 
        FFXSR_BIT = 1<<25,
        PAGE1GB_BIT = 1<<26, // support for 1GB pages
        RDTSCP_BIT = 1<<27,
        AMD64_BIT = 1<<29,   // tested by isX86_64
        AMD_3DNOW_EXT_BIT = 1<<30,
        AMD_3DNOW_BIT = 1<<31
    }
    // AMD misc feature flags CPUID80000001_ECX
    // Masks applied to `amdmiscfeatures` (ECX as stored by cpuidX86 after
    // CPUID function 0x8000_0001).
    enum : uint
    {
        LAHFSAHF_BIT = 1,       // LAHF/SAHF in 64-bit mode
        LZCNT_BIT = 1<<5,
        SSE4A_BIT = 1<<6,       
        AMD_3DNOW_PREFETCH_BIT = 1<<8
    }
282 
// Select the implementation: Really_D_InlineAsm_X86 is set only when the
// compiler advertises D_InlineAsm_X86 and is not GDC.
version(GNU){
    // GDC is deliberately excluded: according to the original author it
    // cannot actually compile the inline asm used below.
} else version(D_InlineAsm_X86) {
    version = Really_D_InlineAsm_X86;
}
288 
289 version(Really_D_InlineAsm_X86) {
290 // Note that this code will also work for Itanium in x86 mode.
291 
// EAX returned by CPUID function 0 and by CPUID function 0x8000_0000
// respectively; both are filled in by cpuidX86().
uint max_cpuid, max_extended_cpuid;
293 
// CPUID2: "cache and tlb information"
//
// Executes CPUID function 2 and decodes every descriptor byte that names a
// data (or unified) cache into datacache[0..3]. Descriptors that are not in
// the table below (TLBs, code caches, prefetch hints...) are ignored.
// The Cyrix MediaGX MMX Enhanced, which returns non-Intel values, is
// recognised and handled separately.
void getcacheinfoCPUID2()
{
    // CPUID2 is a dog's breakfast. What was Intel thinking???
    // We are only interested in the data caches
    void decipherCpuid2(ubyte x) {
        if (x==0) return;
        // Values from http://www.sandpile.org/ia32/cpuid.htm.
        // Includes Itanium and non-Intel CPUs.
        //
        // The three tables are parallel: ids[i] has size sizes[i] (in KB)
        // and associativity ways[i].
        static ubyte[63] ids = [
            // level 1 cache (9 entries)
            0x0A, 0x0C, 0x0D, 0x2C, 0x60, 0x0E, 0x66, 0x67, 0x68,
            // level 2 cache (29 entries)
            0x41, 0x42, 0x43, 0x44, 0x45, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7F,
            0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x49, 0x4E,
            0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x48, 0x80, 0x81,
            // level 3 cache (25 entries)
            0x22, 0x23, 0x25, 0x29, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D,

            0xD0, 0xD1, 0xD2, 0xD6, 0xD7, 0xD8, 0xDC, 0xDD, 0xDE,
            0xE2, 0xE3, 0xE4, 0xEA, 0xEB, 0xEC
        ];
        static uint[63] sizes = [
            // 0x2C is a 32KB L1 data cache (was wrongly listed as 64).
            8, 16, 16, 32, 16, 24, 8, 16, 32,
            128, 256, 512, 1024, 2048, 1024, 128, 256, 512, 1024, 2048, 512,
            256, 512, 1024, 2048, 512, 1024, 4096, 6*1024,
            128, 192, 128, 256, 384, 512, 3072, 512, 128,
            512, 1024, 2048, 4096, 4096, 8192, 6*1024, 8192, 12*1024, 16*1024,

            512, 1024, 2048, 1024, 2048, 4096, 1024+512, 3*1024, 6*1024,
            // 0xEB is an 18MB L3 cache (was wrongly listed as 28MB).
            2*1024, 4*1024, 8*1024, 12*1024, 18*1024, 24*1024
        ];
    // CPUBUG: Pentium M reports 0x2C but tests show it is only 4-way associative
        static ubyte[63] ways = [
            2, 4, 4, 8, 8, 6, 4, 4, 4,
            4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 2,
            8, 8, 8, 8, 4, 8, 16, 24,
            4, 6, 2, 4, 6, 4, 12, 8, 8,
            4, 8, 8, 8, 4, 8, 12, 16, 12, 16,
            4, 4, 4, 8, 8, 8, 12, 12, 12,
            16, 16, 16, 24, 24, 24
        ];
        // Index of the first L2 and of the first L3 entry in the tables.
        // (These were previously one too small, so 0x68 was filed as an L2
        // cache and 0x81 as an L3 cache.)
        enum { FIRSTDATA2 = 9, FIRSTDATA3 = 9+29 }
        for (int i=0; i< ids.length; ++i) {
            if (x!=ids[i]) continue;
            // level is 0-based: 0 = L1, 1 = L2, 2 = L3.
            int level = i< FIRSTDATA2 ? 0: i<FIRSTDATA3 ? 1 : 2;
            // On family 0xF model 6, descriptor 0x49 denotes an L3 cache.
            if (x==0x49 && family==0xF && model==0x6) level=2;
            datacache[level].size=sizes[i];
            datacache[level].associativity=ways[i];
            // Every L3 descriptor in the table has 64-byte lines, as do the
            // L1/L2 descriptors enumerated here; the rest use 32 bytes.
            if (level == 2 || x==0x2C || x==0x0D || x==0x0E
                               || (x>=0x48 && x<=0x80)
                               || x==0x86 || x==0x87
                               || (x>=0x66 && x<=0x68) || (x>=0x39 && x<=0x3E)){
                datacache[level].lineSize = 64;
            } else datacache[level].lineSize = 32;
            break; // ids are unique, nothing more to find
        }
    }

    uint[4] a;
    bool firstTime = true;
    // On a multi-core system, this could theoretically fail, but it's only used
    // for old single-core CPUs.
    uint numinfos = 1;
    do {
        asm {
            mov EAX, 2;
            cpuid;
            mov a, EAX;
            mov a+4, EBX;
            mov a+8, ECX;
            mov a+12, EDX;
        }
        if (firstTime) {
            if (a[0]==0x0000_7001 && a[3]==0x80 && a[1]==0 && a[2]==0) {
        // Cyrix MediaGX MMXEnhanced returns: EAX= 00007001, EDX=00000080.
        // These are NOT standard Intel values
        // (TLB = 32 entry, 4 way associative, 4K pages)
        // (L1 cache = 16K, 4way, linesize16)
                datacache[0].size=8;
                datacache[0].associativity=4;
                datacache[0].lineSize=16;
                return;
            }
            // lsb of a is how many times to loop.
            numinfos = a[0] & 0xFF;
            // A count of zero would make --numinfos wrap around and loop
            // about four billion times; treat it as a single pass.
            if (numinfos == 0) numinfos = 1;
            firstTime = false;
        }
        // AL is never a descriptor, on any iteration.
        a[0] &= 0xFFFF_FF00;
        for (int c=0; c<4;++c) {
            // high bit set == no info.
            if (a[c] & 0x8000_0000) continue;
            decipherCpuid2(cast(ubyte)(a[c] & 0xFF));
            decipherCpuid2(cast(ubyte)((a[c]>>8) & 0xFF));
            decipherCpuid2(cast(ubyte)((a[c]>>16) & 0xFF));
            decipherCpuid2(cast(ubyte)((a[c]>>24) & 0xFF));
        }
    } while (--numinfos);
}
393 
// CPUID4: "Deterministic cache parameters" leaf
//
// Enumerates the sub-leaves of CPUID function 4 until one reports cache
// type 0. Every sub-leaf may raise maxCores; data and unified caches are
// additionally recorded in datacache[level-1].
void getcacheinfoCPUID4()
{
    int cachenum = 0;
    for(;;) {
        uint a, b, number_of_sets;
        asm {
            mov EAX, 4;
            mov ECX, cachenum;
            cpuid;
            mov a, EAX;
            mov b, EBX;
            mov number_of_sets, ECX;
        }
        ++cachenum;
        // EAX[4:0]: 0 = no more caches, 1 = data, 2 = instruction, 3 = unified.
        uint cachetype = a & 0x1F;
        if (cachetype==0) break; // no more caches
        uint numthreads = ((a>>14) & 0xFFF)  + 1;
        uint numcores = ((a>>26) & 0x3F) + 1;
        if (numcores > maxCores) maxCores = numcores;
        if (cachetype!=1 && cachetype!=3) continue; // we only want data & unified caches

        ++number_of_sets;
        // EAX[7:5] is the 1-based cache level. A value of 0 wraps to
        // uint.max here and is rejected by the bounds check below.
        uint level = ((a>>5)&7)-1;
        if (level >= datacache.length) continue; // ignore deep caches

        // Keep the way count at full width: the field is 10 bits wide, so it
        // does not always fit in the ubyte stored in CacheInfo.
        uint numways = (b>>22) + 1;
        uint linesize = (b & 0xFFF)+ 1; // system coherency line size
        uint line_partitions = ((b >> 12)& 0x3FF) + 1;
        bool fullyAssociative = (a & 0x200) != 0;

        // ubyte.max means "fully associative"; way counts too large for a
        // ubyte saturate to the same value instead of being truncated.
        datacache[level].associativity =
            (fullyAssociative || numways >= ubyte.max) ? ubyte.max : cast(ubyte)numways;
        datacache[level].lineSize = linesize;

        // Size = number of sets * associativity * cachelinesize * linepartitions
        // (this holds for fully associative caches too)
        // and must convert to Kb, also dividing by the number of hyperthreads using this cache.
        ulong sz = cast(ulong)number_of_sets * numways * linesize * line_partitions;
        datacache[level].size = cast(uint)(sz / (numthreads *1024));
        if (level == 0 && cachetype==3) {
            // Halve the size for unified L1 caches
            datacache[level].size/=2;
        }
    }
}
433 
// CPUID8000_0005 & 6
//
// Reads the L1 data cache description from CPUID function 0x8000_0005 and,
// when function 0x8000_0006 is available, the L2 and L3 descriptions as
// well. May raise maxCores using function 0x8000_0008.
void getAMDcacheinfo()
{
    uint c5, c6, d6;
    asm {
        mov EAX, 0x8000_0005; // L1 cache
        cpuid;
        // EAX has L1_TLB_4M.
        // EBX has L1_TLB_4K
        // EDX has L1 instruction cache
        mov c5, ECX;
    }

    datacache[0].size = ( (c5>>24) & 0xFF);
    datacache[0].associativity = cast(ubyte)( (c5 >> 16) & 0xFF);
    datacache[0].lineSize = c5 & 0xFF;

    if (max_extended_cpuid >= 0x8000_0006) {
        // AMD K6-III or K6-2+ or later.
        // Kept as a uint: the core count is (low byte of ECX) + 1, which can
        // reach 256 and would wrap to 0 in a ubyte, giving a division by
        // zero below.
        uint numcores = 1;
        if (max_extended_cpuid >=0x8000_0008) {
            uint c8;
            asm {
                mov EAX, 0x8000_0008;
                cpuid;
                mov c8, ECX;
            }
            numcores = (c8 & 0xFF) + 1;
            if (numcores>maxCores) maxCores = numcores;
        }
        asm {
            mov EAX, 0x8000_0006; // L2/L3 cache
            cpuid;
            mov c6, ECX; // L2 cache info
            mov d6, EDX; // L3 cache info
        }

        // Translates the 4-bit associativity code into a number of ways
        // (0xFF = fully associative).
        static ubyte[16] assocmap = [ 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 0xFF ];
        datacache[1].size = (c6>>16) & 0xFFFF;
        datacache[1].associativity = assocmap[(c6>>12)&0xF];
        datacache[1].lineSize = c6 & 0xFF;

        // The L3 cache value is TOTAL, not per core.
        datacache[2].size = ((d6>>18)*512)/numcores; // could be up to 2 * this, -1.
        datacache[2].associativity = assocmap[(d6>>12)&0xF];
        datacache[2].lineSize = d6 & 0xFF;
    }
}
481 
// For Intel CoreI7 and later, use function 0x0B
// to determine number of processors.
//
// Sub-leaf 0 reports the number of logical processors per core and
// sub-leaf 1 the number of logical processors per package. From these,
// maxThreads (threads per package) and maxCores (cores per package) are
// derived, matching the meaning they have on the legacy detection path.
void getCpuInfo0B()
{
    int level=0;
    uint threadsPerCore = 1;
    bool sawPackageLevel = false;
    uint a, b, c, d;
    do {
        asm {
            mov EAX, 0x0B;
            mov ECX, level;
            cpuid;
            mov a, EAX;
            mov b, EBX;
            mov c, ECX;
            mov d, EDX;
        }
        // EBX[15:0] = number of logical processors at this level.
        uint logical = b & 0xFFFF;
        if (logical!=0) {
            if (level==0) threadsPerCore = logical;
            else if (level==1) {
                maxThreads = logical;
                maxCores = logical / threadsPerCore;
                if (maxCores == 0) maxCores = 1;
                sawPackageLevel = true;
            }
        }
        ++level;
    } while (a!=0 || b!=0);
    // Without a second level, the SMT count is the only thread count we have.
    if (!sawPackageLevel) maxThreads = threadsPerCore;
}
508 
// Queries CPUID and fills in all module-level CPU information:
// vendorID, max_cpuid/max_extended_cpuid, the four feature words,
// stepping/model/family, processorName, datacache[], maxCores and maxThreads.
// Must only be called when hasCPUID() returned true.
void cpuidX86()
{
    char * venptr = vendorID.ptr;
    uint a, c, d, a2;
    asm {
        mov EAX, 0;
        cpuid;
        mov a, EAX;
        mov EAX, venptr;
        mov [EAX], EBX;
        mov [EAX + 4], EDX;
        mov [EAX + 8], ECX;
        mov EAX, 0x8000_0000;
        cpuid;
        mov a2, EAX;
    }

    max_cpuid = a;
    max_extended_cpuid = a2;

    probablyIntel = vendorID == "GenuineIntel";
    probablyAMD = vendorID == "AuthenticAMD";
    uint apic = 0; // brand index, apic id
    asm {
        mov EAX, 1; // model, stepping
        cpuid;
        mov a, EAX;
        mov apic, EBX;
        mov c, ECX;
        mov d, EDX;
    }
    features = d;
    miscfeatures = c;
    amdfeatures = 0;
    amdmiscfeatures = 0;
    if (max_extended_cpuid >= 0x8000_0001) {
        asm {
            mov EAX, 0x8000_0001;
            cpuid;
            mov c, ECX;
            mov d, EDX;
        }
        amdmiscfeatures = c;
        amdfeatures = d;
    }
    // Try to detect fraudulent vendorIDs
    if (amd3dnow) probablyIntel = false;

    stepping = a & 0xF;
    uint fbase = (a >> 8) & 0xF;
    uint mbase = (a >> 4) & 0xF;
    // The extended family (bits 27:20) is added to the base family.
    // The mask must be applied before the addition; '+' binds tighter
    // than '&', so the parentheses are required.
    family = ((fbase == 0xF) || (fbase == 0)) ? fbase + ((a >> 20) & 0xFF) : fbase;
    model = ((fbase == 0xF) || (fbase == 6 && probablyIntel) ) ?
         mbase + ((a >> 12) & 0xF0) : mbase;

    if (!probablyIntel && max_extended_cpuid >= 0x8000_0008) {
        // determine max number of cores for AMD
        asm {
            mov EAX, 0x8000_0008;
            cpuid;
            mov c, ECX;
        }
        uint apicsize = (c>>12) & 0xF;
        if (apicsize == 0) {
            // use legacy method: the low byte of ECX holds (cores - 1).
            if (hyperThreadingBit())  maxCores = (c & 0xFF) + 1;
            else maxCores = 1;
        } else {
            // maxcores = 2^ apicsize
            maxCores = 1;
            while (apicsize) { maxCores<<=1; --apicsize; }
        }
    }

    if (max_extended_cpuid >= 0x8000_0004) {
        char *procptr = processorNameBuffer.ptr;
        asm {
            push ESI;
            mov ESI, procptr;
            mov EAX, 0x8000_0002;
            cpuid;
            mov [ESI], EAX;
            mov [ESI+4], EBX;
            mov [ESI+8], ECX;
            mov [ESI+12], EDX;
            mov EAX, 0x8000_0003;
            cpuid;
            mov [ESI+16], EAX;
            mov [ESI+20], EBX;
            mov [ESI+24], ECX;
            mov [ESI+28], EDX;
            mov EAX, 0x8000_0004;
            cpuid;
            mov [ESI+32], EAX;
            mov [ESI+36], EBX;
            mov [ESI+40], ECX;
            mov [ESI+44], EDX;
            pop ESI;
        }
        // Intel P4 and PM pad at front with spaces.
        // Other CPUs pad at end with nulls.
        // Both scans are bounded so that a buffer consisting only of
        // spaces or only of nulls cannot run off either end.
        size_t start = 0, end = processorNameBuffer.length;
        while (start < end && processorNameBuffer[start] == ' ') { ++start; }
        while (end > start && processorNameBuffer[end-1] == 0) { --end; }
        if (start < end) processorName = processorNameBuffer[start..end].idup;
        else processorName = "Unknown CPU";
    } else {
        processorName = "Unknown CPU";
    }
    // Determine cache sizes

    // Intel docs specify that they return 0 for 0x8000_0005.
    // AMD docs do not specify the behaviour for 0004 and 0002.
    // Centaur/VIA and most other manufacturers use the AMD method,
    // except Cyrix MediaGX MMX Enhanced uses their OWN form of CPUID2!
    // NS Geode GX1 provides CyrixCPUID2 _and_ does the same wrong behaviour
    // for CPUID80000005. But Geode GX uses the AMD method

    // Deal with idiotic Geode GX1 - make it same as MediaGX MMX.
    if (max_extended_cpuid==0x8000_0005 && max_cpuid==2) {
        max_extended_cpuid = 0x8000_0004;
    }
    // Therefore, we try the AMD method unless it's an Intel chip.
    // If we still have no info, try the Intel methods.
    datacache[0].size = 0;
    if (max_cpuid<2 || !probablyIntel) {
        if (max_extended_cpuid >= 0x8000_0005) {
            getAMDcacheinfo();
        } else if (probablyAMD) {
            // According to AMDProcRecognitionAppNote, this means CPU
            // K5 model 0, or Am5x86 (model 4), or Am4x86DX4 (model 4)
            // Am5x86 has 16Kb 4-way unified data & code cache.
            datacache[0].size = 8;
            datacache[0].associativity = 4;
            datacache[0].lineSize = 32;
        } else {
            // Some obscure CPU.
            // Values for Cyrix 6x86MX (family 6, model 0)
            datacache[0].size = 64;
            datacache[0].associativity = 4;
            datacache[0].lineSize = 32;
        }
    }
    if ((datacache[0].size == 0) && max_cpuid>=4) {
        getcacheinfoCPUID4();
    }
    if ((datacache[0].size == 0) && max_cpuid>=2) {
        getcacheinfoCPUID2();
    }
    if (datacache[0].size == 0) {
        // Pentium, PMMX, late model 486, or an obscure CPU
        if (mmx) { // Pentium MMX. Also has 8kB code cache.
            datacache[0].size = 16;
            datacache[0].associativity = 4;
            datacache[0].lineSize = 32;
        } else { // Pentium 1 (which also has 8kB code cache)
                 // or 486.
            // Cyrix 6x86: 16, 4way, 32 linesize
            datacache[0].size = 8;
            datacache[0].associativity = 2;
            datacache[0].lineSize = 32;
        }
    }
    if (max_cpuid >=0x0B) {
        // For Intel i7 and later, use function 0x0B to determine
        // cores and hyperthreads.
        getCpuInfo0B();
    } else {
        if (hyperThreadingBit()) maxThreads = (apic>>>16) & 0xFF;
        else maxThreads = maxCores;
    }
}
680 
// Return true if the cpuid instruction is supported.
// The test tries to toggle bit 21 of EFLAGS: if the bit reads back
// changed, the result is true.
// BUG(WONTFIX): Returns false for Cyrix 6x86 and 6x86L. They will be treated as 486 machines.
bool hasCPUID()
{
    uint before, after;
    asm {
        pushfd;
        pop EAX;
        mov before, EAX;        // EFLAGS as found
        xor EAX, 0x0020_0000;   // flip bit 21
        push EAX;
        popfd;                  // try to write it back
        pushfd;
        pop EAX;
        mov after, EAX;         // EFLAGS as the CPU accepted it
    }
    uint changed = before ^ after;
    return (changed & 0x0020_0000) != 0;
}
699 
700 } else { // inline asm X86
701 
702     bool hasCPUID() { return false; }
703 
704     void cpuidX86()
705     {
706             datacache[0].size = 8;
707             datacache[0].associativity = 2;
708             datacache[0].lineSize = 32;     
709     }   
710 }
711 
// TODO: Implement this function with OS support
//
// Fills datacache[0..3] from fixed per-model tables. The model is currently
// hard-wired to PPC603, because it cannot be detected without OS support.
void cpuidPPC()
{
    enum :int  { PPC601, PPC603, PPC603E, PPC604,
                 PPC604E, PPC620, PPCG3, PPCG4, PPCG5 }

    // TODO:
    // asm { mfpvr; } returns the CPU version but unfortunately it can
    // only be used in kernel mode. So OS support is required.
    int cputype = PPC603;

    // Per-model tables, indexed by the enum above. Sizes are in KB.
    // 601 has a 8KB combined data & code L1 cache.
    uint[]  l1Kb   = [4, 8, 16, 16, 32, 32, 32, 32, 64];
    ubyte[] l1Ways = [8, 2,  4,  4,  4,  8,  8,  8,  8];
    uint[]  l2Kb   = [0, 0,  0,  0,  0,  0,  0,  256,  512];
    uint[]  l3Kb   = [0, 0,  0,  0,  0,  0,  0,  2048,  0];

    // All three levels share one line size, which depends on the model.
    uint line;
    if (cputype == PPCG5) {
        line = 128;
    } else if (cputype == PPC620 || cputype == PPCG3) {
        line = 64;
    } else {
        line = 32;
    }

    datacache[0].size = l1Kb[cputype];
    datacache[0].associativity = l1Ways[cputype];
    datacache[0].lineSize = line;

    datacache[1].size = l2Kb[cputype];
    datacache[1].lineSize = line;

    datacache[2].size = l3Kb[cputype];
    datacache[2].lineSize = line;
}
738 
// TODO: Implement this function with OS support
// Currently an empty stub: it modifies nothing. The table below records
// cache parameters for a future implementation.
void cpuidSparc()
{
    // UltraSparcIIi : L1 = 16,  2way. L2 = 512, 4 way.
    // UltraSparcIII : L1 = 64,  4way. L2= 4096 or 8192.
    // UltraSparcIIIi: L1 = 64,  4way. L2= 1024, 4 way
    // UltraSparcIV  : L1 = 64,  4way. L2 = 16*1024.
    // UltraSparcIV+ : L1 = 64,  4way. L2 = 2048, L3=32*1024.
    // Sparc64V      : L1 = 128, 2way. L2 = 4096 4way.  
}
749 
750 
// Module constructor: detects the CPU once at start-up, then makes sure
// every entry of datacache[] holds usable values.
shared static this()
{
    // Without CPUID it's a 386 or 486, or a Cyrix 6x86.
    // Probably still has an external cache.
    if (hasCPUID())
        cpuidX86();

    // Nothing detected: guess same as Pentium 1.
    if (datacache[0].size == 0)
        datacache[0] = CacheInfo(8, 2, 32);

    // Count the detected levels, and fill up all the unused levels
    // with full memory space.
    numCacheLevels = 1;
    foreach (i; 1 .. datacache.length) {
        if (datacache[i].size != 0) {
            numCacheLevels = cast(uint)(i + 1);
            continue;
        }
        // Set this level of cache equal to full address space.
        datacache[i].size = uint.max/1024;
        datacache[i].associativity = 1;
        datacache[i].lineSize = datacache[i-1].lineSize;
    }
}
776 
777 
778 
779 
// Stand-alone test program, compiled only with -debug=Cpuid:
// prints the processor name and its thread and core counts.
debug (Cpuid)
{
        private import tango.io.Stdout;

        void main()
        {
            auto name = processor();
            auto threads = threadsPerCPU();
            auto cores = coresPerCPU();
            Stdout.formatln ("{}, {} threads, {} cores", name, threads, cores);
        }
}
788 }