Okay, using -O1, -O2 or -O3 produces working code, so it looks like the problem is related to -Os.
So I did some more playing around. Compiling with -fverbose-asm
gives us this info:
; GNU C++ (GCC) version 4.8.0 20130306 (experimental) (avr)
; compiled by GNU C version 3.4.5 (mingw-vista special r2), GMP version 4.3.2, MPFR version 2.4.2, MPC version 0.8.2
; GGC heuristics: --param ggc-min-expand=30 --param ggc-min-heapsize=4096
; options passed: -fpreprocessed sketch_feb19a.ii -mmcu=atmega328p
; -mrelax
; -auxbase-strip D:\tmp\build2743407637356413609.tmp\sketch_feb19a.cpp.o
; -gstabs -Os -Wall -fverbose-asm -fno-optimize-strlen
; -fno-rtti -fno-enforce-eh-specs -fno-exceptions
; options enabled: -faggressive-loop-optimizations -fauto-inc-dec
; -fbranch-count-reg -fcaller-saves -fcombine-stack-adjustments -fcommon
; -fcompare-elim -fcprop-registers -fcrossjumping -fcse-follow-jumps
; -fdefer-pop -fdevirtualize -fdwarf2-cfi-asm -fearly-inlining
; -feliminate-unused-debug-types -fexpensive-optimizations
; -fforward-propagate -ffunction-cse -fgcse -fgcse-lm -fgnu-runtime
; -fguess-branch-probability -fhoist-adjacent-loads -fident
; -fif-conversion -fif-conversion2 -findirect-inlining -finline
; -finline-atomics -finline-functions -finline-functions-called-once
; -finline-small-functions -fipa-cp -fipa-profile -fipa-pure-const
; -fipa-reference -fipa-sra -fira-hoist-pressure -fira-share-save-slots
; -fira-share-spill-slots -fivopts -fkeep-static-consts
; -fleading-underscore -fmath-errno -fmerge-constants
; -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer
; -foptimize-register-move -foptimize-sibling-calls -fpartial-inlining
; -fpeephole -fpeephole2 -fprefetch-loop-arrays -freg-struct-return
; -fregmove -freorder-blocks -freorder-functions -frerun-cse-after-loop
; -fsched-critical-path-heuristic -fsched-dep-count-heuristic
; -fsched-group-heuristic -fsched-interblock -fsched-last-insn-heuristic
; -fsched-rank-heuristic -fsched-spec -fsched-spec-insn-heuristic
; -fsched-stalled-insns-dep -fshow-column -fshrink-wrap -fsigned-zeros
; -fsplit-ivs-in-unroller -fsplit-wide-types -fstrict-aliasing
; -fstrict-overflow -fstrict-volatile-bitfields -fsync-libcalls
; -fthread-jumps -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp
; -ftree-builtin-call-dce -ftree-ccp -ftree-ch -ftree-coalesce-vars
; -ftree-copy-prop -ftree-copyrename -ftree-dce -ftree-dominator-opts
; -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-if-convert
; -ftree-loop-im -ftree-loop-ivcanon -ftree-loop-optimize
; -ftree-parallelize-loops= -ftree-phiprop -ftree-pre -ftree-pta
; -ftree-reassoc -ftree-scev-cprop -ftree-sink -ftree-slp-vectorize
; -ftree-slsr -ftree-sra -ftree-switch-conversion -ftree-tail-merge
; -ftree-ter -ftree-vect-loop-version -ftree-vrp -funit-at-a-time
; -fverbose-asm -fzero-initialized-in-bss -mrelax
For the -Os build above I also disabled the strlen optimization (-fno-optimize-strlen), and for the -O2 build below I had to add -finline-functions manually, so that the compiler uses the same options in both builds... or so we hope!
; GNU C++ (GCC) version 4.8.0 20130306 (experimental) (avr)
; compiled by GNU C version 3.4.5 (mingw-vista special r2), GMP version 4.3.2, MPFR version 2.4.2, MPC version 0.8.2
; GGC heuristics: --param ggc-min-expand=30 --param ggc-min-heapsize=4096
; options passed: -fpreprocessed sketch_feb19a.ii -mmcu=atmega328p
; -mrelax
; -auxbase-strip D:\tmp\build2743407637356413609.tmp\sketch_feb19a.cpp.o
; -gstabs -O2 -Wall -fverbose-asm -fno-optimize-strlen -finline-functions
; -fno-rtti -fno-enforce-eh-specs -fno-exceptions
; options enabled: -faggressive-loop-optimizations -fauto-inc-dec
; -fbranch-count-reg -fcaller-saves -fcombine-stack-adjustments -fcommon
; -fcompare-elim -fcprop-registers -fcrossjumping -fcse-follow-jumps
; -fdefer-pop -fdevirtualize -fdwarf2-cfi-asm -fearly-inlining
; -feliminate-unused-debug-types -fexpensive-optimizations
; -fforward-propagate -ffunction-cse -fgcse -fgcse-lm -fgnu-runtime
; -fguess-branch-probability -fhoist-adjacent-loads -fident
; -fif-conversion -fif-conversion2 -findirect-inlining -finline
; -finline-atomics -finline-functions -finline-functions-called-once
; -finline-small-functions -fipa-cp -fipa-profile -fipa-pure-const
; -fipa-reference -fipa-sra -fira-hoist-pressure -fira-share-save-slots
; -fira-share-spill-slots -fivopts -fkeep-static-consts
; -fleading-underscore -fmath-errno -fmerge-constants
; -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer
; -foptimize-register-move -foptimize-sibling-calls -fpartial-inlining
; -fpeephole -fpeephole2 -fprefetch-loop-arrays -freg-struct-return
; -fregmove -freorder-blocks -freorder-functions -frerun-cse-after-loop
; -fsched-critical-path-heuristic -fsched-dep-count-heuristic
; -fsched-group-heuristic -fsched-interblock -fsched-last-insn-heuristic
; -fsched-rank-heuristic -fsched-spec -fsched-spec-insn-heuristic
; -fsched-stalled-insns-dep -fshow-column -fshrink-wrap -fsigned-zeros
; -fsplit-ivs-in-unroller -fsplit-wide-types -fstrict-aliasing
; -fstrict-overflow -fstrict-volatile-bitfields -fsync-libcalls
; -fthread-jumps -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp
; -ftree-builtin-call-dce -ftree-ccp -ftree-ch -ftree-coalesce-vars
; -ftree-copy-prop -ftree-copyrename -ftree-dce -ftree-dominator-opts
; -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-if-convert
; -ftree-loop-im -ftree-loop-ivcanon -ftree-loop-optimize
; -ftree-parallelize-loops= -ftree-phiprop -ftree-pre -ftree-pta
; -ftree-reassoc -ftree-scev-cprop -ftree-sink -ftree-slp-vectorize
; -ftree-slsr -ftree-sra -ftree-switch-conversion -ftree-tail-merge
; -ftree-ter -ftree-vect-loop-version -ftree-vrp -funit-at-a-time
; -fverbose-asm -fzero-initialized-in-bss -mrelax
But it still produces broken code when -Os is used.
Here are the two outputs:
; Body of loop() from the first of the two builds (presumably the working
; one; the post does not label which listing belongs to which -O level).
; It toggles the byte `flag` and stores a 16-bit address in `result`.
; Each branch emits its own string under the numeric label `999:` and has
; its own ldi pair using `999b` right after it, so every `999b` refers to
; the string emitted by the same branch.
.LM1:
.LFBB2:
/* prologue: function */
/* frame size = 0 */
/* stack size = 0 */
.L__stack_usage = 0
.LBB2:
.stabn 68,0,9,.LM2-.LFBB2
.LM2:
; flag ^= 1, written back to memory; r24 keeps the new value for the test.
lds r24,flag ; flag.3, flag
ldi r25,lo8(1) ; tmp48,
eor r24,r25 ; flag.3, tmp48
sts flag,r24 ; flag, flag.3
.stabn 68,0,44,.LM3-.LFBB2
.LM3:
; cpse skips the rjmp when r24 == 0: zero falls through to the "false"
; string, non-zero jumps to .L6 for the "true" string.
cpse r24,__zero_reg__ ; flag.3,
rjmp .L6 ;
.LBB3:
.stabn 68,0,36,.LM4-.LFBB2
.LM4:
; First inline-asm statement of this branch: places "false" in section
; .progmem.data under the local numeric label 999.
/* #APP */
; 36 "sketch_feb19a.ino" 1
.pushsection .progmem.data, "SM", @progbits, 1
999: .string "false"
.popsection
; 0 "" 2
.stabn 68,0,42,.LM5-.LFBB2
.LM5:
; 42 "sketch_feb19a.ino" 1
ldi r24, lo8(999b) ; iftmp.4
ldi r25, hi8(999b) ; iftmp.4
; 0 "" 2
/* #NOAPP */
.LBE3:
.stabn 68,0,44,.LM6-.LFBB2
.LM6:
; result = r25:r24, i.e. the address of the nearest preceding `999:`
; ("false" on this path).
sts result+1,r25 ; result, iftmp.4
sts result,r24 ; result, iftmp.4
ret
.L6:
.LBB4:
.stabn 68,0,20,.LM7-.LFBB2
.LM7:
; Non-zero path: same two-statement pattern, this time with "true".
/* #APP */
; 20 "sketch_feb19a.ino" 1
.pushsection .progmem.data, "SM", @progbits, 1
999: .string "true"
.popsection
; 0 "" 2
.stabn 68,0,26,.LM8-.LFBB2
.LM8:
; 26 "sketch_feb19a.ino" 1
ldi r24, lo8(999b) ; iftmp.4
ldi r25, hi8(999b) ; iftmp.4
; 0 "" 2
/* #NOAPP */
.LBE4:
.stabn 68,0,44,.LM9-.LFBB2
.LM9:
; result = address of the "true" string; this branch has its own store
; and its own ret.
sts result+1,r25 ; result, iftmp.4
sts result,r24 ; result, iftmp.4
ret
.LBE2:
.size loop, .-loop
and .....
; Body of loop() from the second build (the one the post marks as broken;
; presumably -Os, the post does not label the listings explicitly).
; The flag toggle is the same as in the other listing, but here the two
; branches only emit their strings and then share ONE ldi pair at .L5.
; In GNU as, `999b` refers to the most recent `999:` defined earlier in
; the assembly text, not to the one on the executed path, so the shared
; ldi pair always picks up the "false" string.
.LM1:
.LFBB2:
/* prologue: function */
/* frame size = 0 */
/* stack size = 0 */
.L__stack_usage = 0
.LBB2:
.stabn 68,0,9,.LM2-.LFBB2
.LM2:
; flag ^= 1, written back to memory; r24 keeps the new value for the test.
lds r24,flag ; flag.3, flag
ldi r25,lo8(1) ; tmp48,
eor r24,r25 ; flag.3, tmp48
sts flag,r24 ; flag, flag.3
.stabn 68,0,44,.LM3-.LFBB2
.LM3:
; Zero jumps to .L3 (the "false" string); non-zero falls through to the
; "true" string.
tst r24 ; flag.3
breq .L3 ; ,
.LBB3:
.stabn 68,0,20,.LM4-.LFBB2
.LM4:
; Non-zero path: only the string definition is left in this branch; the
; ldi pair that should follow it is no longer here.
/* #APP */
; 20 "sketch_feb19a.ino" 1
.pushsection .progmem.data, "SM", @progbits, 1
999: .string "true"
.popsection
; 0 "" 2
/* #NOAPP */
; The jump itself is legal: it skips the "false" string and joins the
; shared tail at .L5. The damage is that the `999b` at .L5 comes after
; the second `999:` in the text, so this path gets the wrong address.
rjmp .L5 ; <<<<<<<<<<<<<<<<<<<<< bad code !
.L3:
.LBE3:
.LBB4:
.stabn 68,0,36,.LM5-.LFBB2
.LM5:
; Zero path: defines the second `999:`, which is also the last one before
; the shared ldi pair.
/* #APP */
; 36 "sketch_feb19a.ino" 1
.pushsection .progmem.data, "SM", @progbits, 1
999: .string "false"
.popsection
; 0 "" 2
/* #NOAPP */
.L5:
.stabn 68,0,42,.LM6-.LFBB2
.LM6:
; Shared tail for both paths. `999b` binds to the "false" string defined
; just above, whichever branch was taken.
; NOTE(review): looks like the two identical ldi asm statements were merged
; into a single copy; which compiler pass did that is not visible here.
/* #APP */
; 42 "sketch_feb19a.ino" 1
ldi r24, lo8(999b) ; iftmp.4
ldi r25, hi8(999b) ; iftmp.4
; 0 "" 2
/* #NOAPP */
.LBE4:
.stabn 68,0,44,.LM7-.LFBB2
.LM7:
; result = r25:r24, which on both paths is the address of "false".
sts result+1,r25 ; result, iftmp.4
sts result,r24 ; result, iftmp.4
ret
.LBE2:
.size loop, .-loop
Note how in one case it swapped the true and false blocks, and also that the test of the flag at .LM3 is done differently in the two cases.
