/**
 * The Atomic module provides some basic support for so-called lock-free
 * concurrent programming.
 * The current design replaces the previous Atomic module by Sean and is inspired
 * partly by the LLVM atomic operations and partly by Sean's version.
 *
 * If no atomic ops are available, an (inefficient) lock-based fallback is provided.
 * For classes, atomic access means atomic access to their *reference*, not to their content.
 *
 * If you want unique counters or flags to communicate in multithreading settings,
 * look at tango.core.sync.Counter, which provides them in a more convenient way and
 * copes better with the absence of atomic ops.
 *
 * Copyright: Copyright (C) 2008-2010 the blip developer group
 * License:   BSD style: $(LICENSE)
 * Author:    Fawzi Mohamed
 */

module tango.core.sync.Atomic;

version( LDC )
{
    import ldc.intrinsics;
}

private {
    // from tango.core.traits:
    /**
     * Evaluates to true if T is a signed or unsigned integer type.
     */
    template isIntegerType( T )
    {
        const bool isIntegerType = isSignedIntegerType!(T) ||
                                   isUnsignedIntegerType!(T);
    }
    /**
     * Evaluates to true if T is a pointer or class type.
     */
    template isPointerOrClass(T)
    {
        const isPointerOrClass = is(T==class);
    }

    template isPointerOrClass(T : T*)
    {
            const isPointerOrClass = true;
    }
    /**
     * Evaluates to true if T is a signed integer type.
     */
    template isSignedIntegerType( T )
    {
        const bool isSignedIntegerType = is( T == byte )  ||
                                         is( T == short ) ||
                                         is( T == int )   ||
                                         is( T == long )/+||
                                         is( T == cent  )+/;
    }
    /**
     * Evaluates to true if T is an unsigned integer type.
     */
    template isUnsignedIntegerType( T )
    {
        const bool isUnsignedIntegerType = is( T == ubyte )  ||
                                           is( T == ushort ) ||
                                           is( T == uint )   ||
                                           is( T == ulong )/+||
                                           is( T == ucent  )+/;
    }

    /// substitutes classes with void*
    template ClassPtr(T){
        static if (is(T==class)){
            alias void* ClassPtr;
        } else {
            alias T ClassPtr;
        }
    }
}

extern(C) void thread_yield();

// NOTE: Strictly speaking, the x86 supports atomic operations on
//       unaligned values.  However, this is far slower than the
//       common case, so such behavior should be prohibited.
template atomicValueIsProperlyAligned( T )
{
    bool atomicValueIsProperlyAligned( size_t addr )
    {
        return addr % ClassPtr!(T).sizeof == 0;
    }
}

/*
 * A barrier disallows some kinds of intermixing and out-of-order execution
 * and ensures that all operations of one kind are executed before the operations of the other kind.
 * Which kinds of mixing are disallowed depends on the template arguments.
 * These are global barriers: the whole memory is synchronized (devices excluded if device is false).
 *
 * The barrier actually enforced might be stronger than the requested one.
 *
 * if ll is true, loads before the barrier are not allowed to mix with loads after the barrier
 * if ls is true, loads before the barrier are not allowed to mix with stores after the barrier
 * if sl is true, stores before the barrier are not allowed to mix with loads after the barrier
 * if ss is true, stores before the barrier are not allowed to mix with stores after the barrier
 * if device is true, uncached and device memory is also synchronized
 *
 * Barriers are typically paired.
 *
 * For example, if you want to ensure that all writes
 * are done before setting a flag that communicates that an object is initialized, you would
 * need memoryBarrier(false,false,false,true) before setting the flag.
 * To read that flag before reading the rest of the object you would need a
 * memoryBarrier(true,false,false,false) after having read the flag.
 * (A sketch of this pairing is given after the barrier utility functions below.)
 *
 * I believe these two barriers are called release and acquire barriers respectively, but you find
 * several incompatible definitions around (some obviously wrong), so some care might be in order.
 * To be safer, memoryBarrier(false,true,false,true) might be used on the release (write) side, and
 * memoryBarrier(true,false,true,false) on the acquire (read) side, which are slightly stronger.
 *
 * These barriers are also called write barrier and read barrier respectively.
 *
 * A full memory fence is (true,true,true,true) and ensures that stores and loads before the
 * barrier are done before stores and loads after it.
 * Keep in mind that even with a full barrier you normally still need two of them, to prevent the
 * other CPU from reordering loads (for example) and still seeing things in the wrong order.
*/
/* llvm_memory_barrier gone?
version( LDC )
{
    void memoryBarrier(bool ll, bool ls, bool sl,bool ss,bool device=false)(){
        // XXX: this is overly conservative
        llvm_memory_fence();
    }
} else */version(D_InlineAsm_X86){
    void memoryBarrier(bool ll, bool ls, bool sl,bool ss,bool device=false)(){
        static if (device) {
            if (ls || sl || ll || ss){
                // cpuid should serialize even more strongly than mfence
                asm {
                    push EBX;
                    mov EAX, 0; // any leaf will do, cpuid is used only for its serializing effect
                    cpuid;
                    pop EBX;
                }
            }
        } else static if (ls || sl || (ll && ss)){ // use a serializing operation like cpuid or simply cmpxchg instead?
            asm {
                mfence;
            }
            // a locked xchg with a stack slot (push EAX; xchg [ESP], EAX; pop EAX) is supposedly
            // faster and also a full fence, but let's play it safe and use the dedicated instruction
        } else static if (ll){
            asm {
                lfence;
            }
        } else static if( ss ){
            asm {
                sfence;
            }
        }
    }
} else version(D_InlineAsm_X86_64){
    void memoryBarrier(bool ll, bool ls, bool sl,bool ss,bool device=false)(){
        static if (device) {
            if (ls || sl || ll || ss){
                // cpuid should serialize even more strongly than mfence
                asm {
                    push RBX;
                    mov RAX, 0; // any leaf will do, cpuid is used only for its serializing effect
                    cpuid;
                    pop RBX;
                }
            }
        } else static if (ls || sl || (ll && ss)){ // use a serializing operation like cpuid or simply cmpxchg instead?
            asm {
                mfence;
            }
            // a locked xchg with a stack slot (push RAX; xchg [RSP], RAX; pop RAX) is supposedly
            // faster and also a full fence, but let's play it safe and use the dedicated instruction
        } else static if (ll){
            asm {
                lfence;
            }
        } else static if( ss ){
            asm {
                sfence;
            }
        }
    }
} else {
    pragma(msg,"WARNING: no atomic operations on this architecture");
    pragma(msg,"WARNING: this is *slow* you probably want to change this!");
    int dummy;
    // acquires a lock... probably you will want to skip this
    void memoryBarrier(bool ll, bool ls, bool sl,bool ss,bool device=false)(){
        synchronized { dummy=1; }
    }
    enum{LockVersion=true}
}

static if (!is(typeof(LockVersion))) {
    enum{LockVersion=false}
}

// use stricter fences
enum{strictFences=false}

/// Utility function for a write barrier (disallows store-store reordering).
void writeBarrier(){
    memoryBarrier!(false,false,strictFences,true)();
}
/// Utility function for a read barrier (disallows load-load reordering).
void readBarrier(){
    memoryBarrier!(true,strictFences,false,false)();
}
/// Utility function for a full barrier (disallows any reordering).
void fullBarrier(){
    memoryBarrier!(true,true,true,true)();
}
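
// Illustrative sketch (not part of the original API): the write/read barrier pairing described
// above, used to publish a value behind a "ready" flag. The names payload/ready/publish/consume
// are made up for this example; in real use publish and consume would run on different threads.
unittest{
    int payload=0;
    bool ready=false;

    void publish(int v){
        payload=v;      // initialize the data first...
        writeBarrier(); // ...ensure the stores above complete before the flag store
        ready=true;     // then publish
    }

    bool consume(ref int v){
        if (!ready) return false;
        readBarrier();  // ensure the loads below happen after the flag load
        v=payload;
        return true;
    }

    publish(42);
    int v;
    assert(consume(v) && v==42);
}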

/*
 * Atomic swap: stores newval into val and returns the previous value of val,
 * all in one atomic operation.
 * Barriers are not implied, just atomicity!
*/
version(LDC){
    T atomicSwap( T )( ref T val, T newval )
    {
        T oldval = void;
        static if (isPointerOrClass!(T))
        {
            oldval = cast(T)llvm_atomic_swap!(size_t)(cast(shared(size_t)*)&val, cast(size_t)newval);
        }
        else static if (is(T == bool))
        {
            oldval = llvm_atomic_swap!(ubyte)(cast(shared(ubyte)*)&val, newval?1:0) != 0; // old byte 1 means the old value was true
        }
        else
        {
            oldval = llvm_atomic_swap!(T)(cast(shared)&val, newval);
        }
        return oldval;
    }
} else version(D_InlineAsm_X86) {
    T atomicSwap( T )( ref T val, T newval )
    in {
        // NOTE: 32 bit x86 systems support 8 byte CAS, which only requires
        //       4 byte alignment, so use size_t as the align type here.
        static if( T.sizeof > size_t.sizeof )
            assert( atomicValueIsProperlyAligned!(size_t)( cast(size_t) &val ) );
        else
            assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        T*posVal=&val;
        static if( T.sizeof == byte.sizeof ) {
            asm {
                mov AL, newval;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [ECX], AL;
            }
        }
        else static if( T.sizeof == short.sizeof ) {
            asm {
                mov AX, newval;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [ECX], AX;
            }
        }
        else static if( T.sizeof == int.sizeof ) {
            asm {
                mov EAX, newval;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [ECX], EAX;
            }
        }
        else static if( T.sizeof == long.sizeof ) {
            // 8 Byte swap on 32-Bit Processor, use CAS?
            static assert( false, "Invalid template type specified, 8bytes in 32 bit mode: "~T.stringof );
        }
        else
        {
            static assert( false, "Invalid template type specified: "~T.stringof );
        }
    }
} else version (D_InlineAsm_X86_64){
    T atomicSwap( T )( ref T val, T newval )
    in {
        assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        T*posVal=&val;
        static if( T.sizeof == byte.sizeof ) {
            asm {
                mov AL, newval;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [RCX], AL;
            }
        }
        else static if( T.sizeof == short.sizeof ) {
            asm {
                mov AX, newval;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [RCX], AX;
            }
        }
        else static if( T.sizeof == int.sizeof ) {
            asm {
                mov EAX, newval;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [RCX], EAX;
            }
        }
        else static if( T.sizeof == long.sizeof ) {
            asm {
                mov RAX, newval;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [RCX], RAX;
            }
        }
        else
        {
            static assert( false, "Invalid template type specified: "~T.stringof );
        }
    }
} else {
    T atomicSwap( T )( ref T val, T newval )
    in {
        assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        T oldVal;
        synchronized(typeid(T)){
            oldVal=val;
            val=newval;
        }
        return oldVal;
    }
}
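
// Illustrative sketch (not part of the original API): atomicSwap as a test-and-set primitive.
// busy/tryAcquire/release are made-up names; a real spin lock would also pair this with the
// appropriate barriers (see the flag* helpers further down).
unittest{
    bool busy=false;

    // returns true if the caller obtained the (hypothetical) lock
    bool tryAcquire(){
        // the old value is false only if nobody held the lock before the swap
        return atomicSwap(busy,true)==false;
    }
    void release(){
        atomicSwap(busy,false);
    }

    assert(tryAcquire());  // the first caller gets it
    assert(!tryAcquire()); // a second attempt sees busy==true
    release();
    assert(tryAcquire());
}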

//---------------------
// internal conversion template
private T aCasT(T,V)(ref T val, T newval, T equalTo){
    union UVConv{V v; T t;}
    union UVPtrConv{V *v; T *t;}
    UVConv vNew,vOld,vAtt;
    UVPtrConv valPtr;
    vNew.t=newval;
    vOld.t=equalTo;
    valPtr.t=&val;
    vAtt.v=atomicCAS(*valPtr.v,vNew.v,vOld.v);
    return vAtt.t;
}
/// internal reduction
private T aCas(T)(ref T val, T newval, T equalTo){
    static if (T.sizeof==1){
        return aCasT!(T,ubyte)(val,newval,equalTo);
    } else static if (T.sizeof==2){
        return aCasT!(T,ushort)(val,newval,equalTo);
    } else static if (T.sizeof==4){
        return aCasT!(T,uint)(val,newval,equalTo);
    } else static if (T.sizeof==8){ // unclear if it is always supported...
        return aCasT!(T,ulong)(val,newval,equalTo);
    } else {
        static assert(0,"invalid type "~T.stringof);
    }
}

/*
 * Atomic compare & exchange (can be used to implement all the other operations):
 * stores newval into val if val==equalTo, in one atomic operation.
 * Barriers are not implied, just atomicity!
 * Returns the old value of val (i.e. the exchange was performed exactly if
 * result==equalTo; otherwise the result can be used as the current value).
*/
version(LDC){
    T atomicCAS( T )( ref T val, T newval, T equalTo )
    {
        T oldval = void;
        static if (isPointerOrClass!(T))
        {
            oldval = cast(T)cast(void*)llvm_atomic_cmp_swap!(size_t)(cast(shared(size_t)*)cast(void*)&val, cast(size_t)cast(void*)equalTo, cast(size_t)cast(void*)newval);
        }
        else static if (is(T == bool)) // correct also if bool has a different size?
        {
            oldval = aCas(val,newval,equalTo); // assuming true is *always* 1 and not just any non-zero value...
        }
        else static if (isIntegerType!(T))
        {
            oldval = llvm_atomic_cmp_swap!(T)(cast(shared)&val, equalTo, newval);
        } else {
            oldval = aCas(val,newval,equalTo);
        }
        return oldval;
    }
} else version(D_InlineAsm_X86) {
    version(OSX){
        extern(C) ubyte OSAtomicCompareAndSwap64(long oldValue, long newValue,
                 long *theValue); // assumes that in C sizeof(_Bool)==1 (as given in osx IA-32 ABI)
    }
    T atomicCAS( T )( ref T val, T newval, T equalTo )
    in {
        // NOTE: 32 bit x86 systems support 8 byte CAS, which only requires
        //       4 byte alignment, so use size_t as the align type here.
        static if( ClassPtr!(T).sizeof > size_t.sizeof )
            assert( atomicValueIsProperlyAligned!(size_t)( cast(size_t) &val ) );
        else
            assert( atomicValueIsProperlyAligned!(ClassPtr!(T))( cast(size_t) &val ) );
    } body {
        T*posVal=&val;
        static if( T.sizeof == byte.sizeof ) {
            asm {
                mov DL, newval;
                mov AL, equalTo;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [ECX], DL;
            }
        }
        else static if( T.sizeof == short.sizeof ) {
            asm {
                mov DX, newval;
                mov AX, equalTo;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [ECX], DX;
            }
        }
        else static if( ClassPtr!(T).sizeof == int.sizeof ) {
            asm {
                mov EDX, newval;
                mov EAX, equalTo;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [ECX], EDX;
            }
        }
        else static if( T.sizeof == long.sizeof ) {
            // 8 Byte StoreIf on 32-Bit Processor
            version(OSX){
                union UVConv{long v; T t;}
                union UVPtrConv{long *v; T *t;}
                UVConv vEqual,vNew;
                UVPtrConv valPtr;
                vEqual.t=equalTo;
                vNew.t=newval;
                valPtr.t=&val;
                while(1){
                    if(OSAtomicCompareAndSwap64(vEqual.v, vNew.v, valPtr.v)!=0)
                    {
                        return equalTo;
                    } else {
                        {
                            T res=val;
                            if (res!is equalTo) return res;
                        }
                    }
                }
            } else {
                T res;
                asm
                {
                    push EDI;
                    push EBX;
                    lea EDI, newval;
                    mov EBX, [EDI];
                    mov ECX, 4[EDI];
                    lea EDI, equalTo;
                    mov EAX, [EDI];
                    mov EDX, 4[EDI];
                    mov EDI, val;
                    lock; // lock always needed to make this op atomic
                    cmpxchg8b [EDI];
                    lea EDI, res;
                    mov [EDI], EAX;
                    mov 4[EDI], EDX;
                    pop EBX;
                    pop EDI;
                }
                return res;
            }
        }
        else
        {
            static assert( false, "Invalid template type specified: "~T.stringof );
        }
    }
} else version (D_InlineAsm_X86_64){
    T atomicCAS( T )( ref T val, T newval, T equalTo )
    in {
        assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        T*posVal=&val;
        static if( T.sizeof == byte.sizeof ) {
            asm {
                mov DL, newval;
                mov AL, equalTo;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [RCX], DL;
            }
        }
        else static if( T.sizeof == short.sizeof ) {
            asm {
                mov DX, newval;
                mov AX, equalTo;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [RCX], DX;
            }
        }
        else static if( ClassPtr!(T).sizeof == int.sizeof ) {
            asm {
                mov EDX, newval;
                mov EAX, equalTo;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [RCX], EDX;
            }
        }
        else static if( ClassPtr!(T).sizeof == long.sizeof ) {
            asm {
                mov RDX, newval;
                mov RAX, equalTo;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [RCX], RDX;
            }
        }
        else
        {
            static assert( false, "Invalid template type specified: "~T.stringof );
        }
    }
} else {
    T atomicCAS( T )( ref T val, T newval, T equalTo )
    in {
        assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        T oldval;
        synchronized(typeid(T)){
            oldval=val;
            if(oldval==equalTo) {
                val=newval;
            }
        }
        return oldval;
    }
}

bool atomicCASB(T)( ref T val, T newval, T equalTo ){
    return (equalTo is atomicCAS(val,newval,equalTo));
}
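
// Illustrative sketch (not part of the original API): the typical compare-and-swap retry loop,
// here used to atomically store the maximum of the current value and a new candidate.
// maxSeen/updateMax are made-up names; atomicCASB returns true when the exchange was performed.
unittest{
    int maxSeen=3;

    // hypothetical helper: atomically set maxSeen=max(maxSeen,candidate)
    void updateMax(int candidate){
        int cur=maxSeen;
        while (cur<candidate){
            int prev=atomicCAS(maxSeen,candidate,cur);
            if (prev==cur) break; // exchange happened
            cur=prev;             // somebody else changed it, retry with the new value
        }
    }

    updateMax(7);
    assert(maxSeen==7);
    updateMax(5);                    // a smaller candidate leaves the value untouched
    assert(maxSeen==7);
    assert(atomicCASB(maxSeen,8,7)); // direct boolean CAS
    assert(maxSeen==8);
}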

/*
 * Loads a value from memory.
 *
 * At the moment it is assumed that all aligned memory accesses are atomic
 * in the sense that all bits read are consistent with some store.
 *
 * Remove this? I know of no actual architecture where this would be different.
*/
T atomicLoad(T)(ref T val)
in {
    assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    static assert(ClassPtr!(T).sizeof<=size_t.sizeof,"invalid size for "~T.stringof);
} body {
    T res=val;
    return res;
}

/*
 * Stores a value to memory.
 *
 * At the moment it is assumed that all aligned memory accesses are atomic
 * in the sense that a load either sees the complete store or the previous value.
 *
 * Remove this? I know of no actual architecture where this would be different.
*/
void atomicStore(T)(ref T val, T newVal)
in {
        assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ), "invalid alignment" );
        static assert(ClassPtr!(T).sizeof<=size_t.sizeof,"invalid size for "~T.stringof);
} body {
    val=newVal;
}
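
// Illustrative sketch (not part of the original API): atomicLoad/atomicStore are plain,
// at-most-word-sized accesses that are only assumed to be indivisible; they imply no ordering,
// so they are normally combined with the barriers or the flag* helpers in this module.
unittest{
    size_t slot=0;
    atomicStore(slot,cast(size_t)123);
    assert(atomicLoad(slot)==123);
    // types larger than size_t are not supported here (see the static asserts in the
    // contracts above); use atomicSwap/atomicCAS for those.
}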

/*
 * Atomically adds incV to val and returns the previous value.
 *
 * Some architectures might allow only increments/decrements by 1.
 * No barriers implied, only atomicity!
*/
version(LDC){
    T atomicAdd(T)(ref T val, T incV){
        static if (isPointerOrClass!(T))
        {
            return cast(T)llvm_atomic_load_add!(size_t)(cast(shared(size_t)*)&val, cast(size_t)cast(void*)incV);
        }
        else static if (isIntegerType!(T))
        {
            static assert( isIntegerType!(T), "invalid type "~T.stringof );
            return llvm_atomic_load_add!(T)(cast(shared)&val, cast(T)incV);
        } else {
            return atomicOp(val,delegate T(T a){ return a+incV; });
        }
    }
} else version (D_InlineAsm_X86){
    T atomicAdd(T,U=T)(ref T val, U incV_){
        T incV=cast(T)incV_;
        static if (isIntegerType!(T)||isPointerOrClass!(T)){
            T* posVal=&val;
            T res;
            static if (T.sizeof==1){
                asm {
                    mov DL, incV;
                    mov ECX, posVal;
                    lock;
                    xadd byte ptr [ECX],DL;
                    mov byte ptr res[EBP],DL;
                }
            } else static if (T.sizeof==2){
                asm {
                    mov DX, incV;
                    mov ECX, posVal;
                    lock;
                    xadd short ptr [ECX],DX;
                    mov short ptr res[EBP],DX;
                }
            } else static if (T.sizeof==4){
                asm
                {
                    mov EDX, incV;
                    mov ECX, posVal;
                    lock;
                    xadd int ptr [ECX],EDX;
                    mov int ptr res[EBP],EDX;
                }
            } else static if (T.sizeof==8){
                return atomicOp(val,delegate (T x){ return x+incV; });
            } else {
                static assert(0,"Unsupported type size");
            }
            return res;
        } else {
            return atomicOp(val,delegate T(T a){ return a+incV; });
        }
    }
} else version (D_InlineAsm_X86_64){
    T atomicAdd(T,U=T)(ref T val, U incV_){
        T incV=cast(T)incV_;
        static if (isIntegerType!(T)||isPointerOrClass!(T)){
            T* posVal=&val;
            T res;
            static if (T.sizeof==1){
                asm {
                    mov DL, incV;
                    mov RCX, posVal;
                    lock;
                    xadd byte ptr [RCX],DL;
                    mov byte ptr res[RBP],DL;
                }
            } else static if (T.sizeof==2){
                asm {
                    mov DX, incV;
                    mov RCX, posVal;
                    lock;
                    xadd short ptr [RCX],DX;
                    mov short ptr res[RBP],DX;
                }
            } else static if (T.sizeof==4){
                asm
                {
                    mov EDX, incV;
                    mov RCX, posVal;
                    lock;
                    xadd int ptr [RCX],EDX;
                    mov int ptr res[RBP],EDX;
                }
            } else static if (T.sizeof==8){
                asm
                {
                    mov RAX, val;
                    mov RDX, incV;
                    lock; // lock always needed to make this op atomic
                    xadd qword ptr [RAX],RDX;
                    mov res[RBP],RDX;
                }
            } else {
                static assert(0,"Unsupported type size for type:"~T.stringof);
            }
            return res;
        } else {
            return atomicOp(val,delegate T(T a){ return a+incV; });
        }
    }
} else {
    static if (LockVersion){
        T atomicAdd(T,U=T)(ref T val, U incV_){
            T incV=cast(T)incV_;
            static assert( isIntegerType!(T)||isPointerOrClass!(T),"invalid type: "~T.stringof );
            synchronized(typeid(T)){
                T oldV=val;
                val+=incV;
                return oldV;
            }
        }
    } else {
        T atomicAdd(T,U=T)(ref T val, U incV_){
            T incV=cast(T)incV_;
            static assert( isIntegerType!(T)||isPointerOrClass!(T),"invalid type: "~T.stringof );
            synchronized(typeid(T)){
                T oldV,newV,nextVal;
                nextVal=val;
                do{
                    oldV=nextVal;
                    newV=oldV+incV;
                    nextVal=atomicCAS!(T)(val,newV,oldV);
                } while(nextVal!=oldV);
                return oldV;
            }
        }
    }
}
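
// Illustrative sketch (not part of the original API): atomicAdd is a fetch-and-add, e.g. for
// handing out consecutive ids; counter/nextId are made-up names for this example.
unittest{
    int counter=0;

    int nextId(){
        return atomicAdd(counter,1); // returns the value *before* the addition
    }

    assert(nextId()==0);
    assert(nextId()==1);
    assert(counter==2);
}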

/*
 * Applies a pure function atomically.
 * The function should be pure, as it might be called several times to ensure atomicity,
 * and it should be cheap to compute, otherwise contention becomes likely.
 * There is no "fair" arbitration between a fast function (more likely to succeed) and
 * slower ones, so do not use this in case of high contention.
*/
T atomicOp(T)(ref T val, T delegate(T) f){
    T oldV,newV,nextV;
    int i=0;
    nextV=val;
    do {
        oldV=nextV;
        newV=f(oldV);
        nextV=aCas!(T)(val,newV,oldV);
        if (nextV is oldV || newV is oldV) return oldV;
    } while(++i<200);
    while (true){
        thread_yield();
        oldV=val;
        newV=f(oldV);
        nextV=aCas!(T)(val,newV,oldV);
        if (nextV is oldV || newV is oldV) return oldV;
    }
}
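
// Illustrative sketch (not part of the original API): atomicOp applies a cheap, side-effect-free
// delegate in a compare-and-swap retry loop; here it atomically sets a bit in a flags word.
// flags/setBit are made-up names for this example.
unittest{
    uint flags=0;

    uint setBit(uint mask){
        // the delegate may be evaluated more than once if another thread interferes,
        // so it must not have side effects
        return atomicOp(flags,delegate uint(uint v){ return v|mask; });
    }

    setBit(0x4);
    setBit(0x1);
    assert(flags==0x5);
}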

/*
 * Reads a flag (ensuring that accesses issued after the read cannot be moved before it).
*/
T flagGet(T)(ref T flag){
    T res;
    res=flag;
    memoryBarrier!(true,false,strictFences,false)();
    return res;
}

/*
 * Sets a flag (ensuring that all pending writes are executed before the flag is set).
 * The original value is returned.
*/
T flagSet(T)(ref T flag,T newVal){
    memoryBarrier!(false,strictFences,false,true)();
    return atomicSwap(flag,newVal);
}

/*
 * Updates a flag with the given operation (ensuring that all pending writes are executed
 * before the update). The original value is returned.
*/
T flagOp(T)(ref T flag,T delegate(T) op){
    memoryBarrier!(false,strictFences,false,true)();
    return atomicOp(flag,op);
}

/*
 * Adds incV to a flag (ensuring that all pending writes are executed before the addition).
 * The original value is returned.
*/
T flagAdd(T)(ref T flag,T incV=cast(T)1){
    static if (!LockVersion)
        memoryBarrier!(false,strictFences,false,true)();
    return atomicAdd(flag,incV);
}

/*
 * Returns the current value of val and increments it in one atomic operation.
 * Useful for counters and to generate unique values (fast);
 * no barriers are implied.
*/
T nextValue(T)(ref T val){
    return atomicAdd(val,cast(T)1);
}
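
// Illustrative sketch (not part of the original API): the flag* helpers combine the atomic
// operation with the barrier normally wanted when a variable is used to communicate between
// threads (flagSet/flagAdd fence before the update, flagGet fences after the read).
// produced/done are made-up names; in real use the two halves would run on different threads.
unittest{
    int produced=0;
    bool done=false;

    // producer side
    produced=10;        // the work happens first...
    flagSet(done,true); // ...then the flag is set, with the write fenced before it

    // consumer side
    if (flagGet(done)){ // the read is fenced before the dependent loads
        assert(produced==10);
    }

    // unique values without any ordering guarantees
    size_t counter=0;
    assert(nextValue(counter)==0);
    assert(nextValue(counter)==1);
}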