changeset 54:1257c74c33fd

Perf-tuning trimmed a bunch from various sections.. more to come
author Sean Halle <seanhalle@yahoo.com>
date Tue, 19 Jun 2012 16:18:34 -0700
parents a358d611a1a7
children 34407b0009a9
files 0__Papers/Future_Architecture/figures/Substitute_instr_with_firm-ware.svg 0__Papers/Holistic_Model/Perf_Tune/figures/UCC_concreteness_grid.pdf 0__Papers/Holistic_Model/Perf_Tune/figures/UCC_concreteness_grid.svg 0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex
diffstat 4 files changed, 580 insertions(+), 512 deletions(-) [+]
line diff
     1.1 --- a/0__Papers/Future_Architecture/figures/Substitute_instr_with_firm-ware.svg	Tue Jun 05 07:28:32 2012 -0700
     1.2 +++ b/0__Papers/Future_Architecture/figures/Substitute_instr_with_firm-ware.svg	Tue Jun 19 16:18:34 2012 -0700
     1.3 @@ -860,10 +860,10 @@
     1.4       inkscape:pageopacity="0.0"
     1.5       inkscape:pageshadow="2"
     1.6       inkscape:zoom="1.97"
     1.7 -     inkscape:cx="279.34259"
     1.8 -     inkscape:cy="914.39898"
     1.9 +     inkscape:cx="200.15477"
    1.10 +     inkscape:cy="845.46497"
    1.11       inkscape:document-units="px"
    1.12 -     inkscape:current-layer="g3145"
    1.13 +     inkscape:current-layer="g3247"
    1.14       showgrid="false"
    1.15       inkscape:window-width="1600"
    1.16       inkscape:window-height="848"
    1.17 @@ -878,7 +878,7 @@
    1.18          <dc:format>image/svg+xml</dc:format>
    1.19          <dc:type
    1.20             rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
    1.21 -        <dc:title></dc:title>
    1.22 +        <dc:title />
    1.23        </cc:Work>
    1.24      </rdf:RDF>
    1.25    </metadata>
    1.26 @@ -914,219 +914,270 @@
    1.27           id="tspan5325">Language Runtime as Firm-ware</tspan></text>
    1.28      <g
    1.29         id="g3161">
    1.30 -      <path
    1.31 -         style="fill:none;stroke:#000000;stroke-width:2.20000005;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
    1.32 -         d="m 269.66863,174.15449 18.93492,11.56848 0,60.27785 -18.93492,11.56847 0,-84.02367"
    1.33 -         id="path3702"
    1.34 -         inkscape:connector-curvature="0"
    1.35 -         sodipodi:nodetypes="ccccc" />
    1.36        <g
    1.37 -         transform="translate(137.23188,-354.82812)"
    1.38 -         id="g3503">
    1.39 -        <text
    1.40 -           transform="matrix(0,-0.82746283,1.2085135,0,0,0)"
    1.41 -           sodipodi:linespacing="100%"
    1.42 -           id="text3505"
    1.43 -           y="220.9987"
    1.44 -           x="-689.91699"
    1.45 -           style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
    1.46 -           xml:space="preserve"><tspan
    1.47 -             id="tspan3507"
    1.48 +         id="g3247"
    1.49 +         transform="matrix(0,1,-1,0,532.14214,-103.86285)">
    1.50 +        <path
    1.51 +           sodipodi:nodetypes="ccccc"
    1.52 +           inkscape:connector-curvature="0"
    1.53 +           id="path3702"
    1.54 +           d="m 269.66863,174.15449 18.93492,11.56848 0,60.27785 -18.93492,11.56847 0,-84.02367"
    1.55 +           style="fill:none;stroke:#000000;stroke-width:2.20000005;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none" />
    1.56 +        <g
    1.57 +           id="g3503"
    1.58 +           transform="translate(137.23188,-354.82812)">
    1.59 +          <text
    1.60 +             xml:space="preserve"
    1.61 +             style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
    1.62 +             x="-689.91699"
    1.63 +             y="220.9987"
    1.64 +             id="text3505"
    1.65 +             sodipodi:linespacing="100%"
    1.66 +             transform="matrix(0,-0.82746283,1.2085135,0,0,0)"><tspan
    1.67 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
    1.68 +               sodipodi:role="line"
    1.69 +               x="-689.91699"
    1.70 +               y="220.9987"
    1.71 +               id="tspan3507">Execute</tspan></text>
    1.72 +          <rect
    1.73 +             y="541.71765"
    1.74 +             x="248.55019"
    1.75 +             height="58.297375"
    1.76 +             width="29.62429"
    1.77 +             id="rect3509"
    1.78 +             style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
    1.79 +        </g>
    1.80 +        <g
    1.81 +           transform="translate(177.23188,-354.82812)"
    1.82 +           id="g3517">
    1.83 +          <text
    1.84 +             transform="matrix(0,-0.82746283,1.2085135,0,0,0)"
    1.85 +             sodipodi:linespacing="100%"
    1.86 +             id="text3519"
    1.87               y="220.9987"
    1.88               x="-689.91699"
    1.89 -             sodipodi:role="line"
    1.90 -             style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1">Execute</tspan></text>
    1.91 -        <rect
    1.92 -           style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
    1.93 -           id="rect3509"
    1.94 -           width="29.62429"
    1.95 -           height="58.297375"
    1.96 -           x="248.55019"
    1.97 -           y="541.71765" />
    1.98 -      </g>
    1.99 -      <g
   1.100 -         id="g3517"
   1.101 -         transform="translate(177.23188,-354.82812)">
   1.102 -        <text
   1.103 -           xml:space="preserve"
   1.104 -           style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.105 -           x="-689.91699"
   1.106 -           y="220.9987"
   1.107 -           id="text3519"
   1.108 -           sodipodi:linespacing="100%"
   1.109 -           transform="matrix(0,-0.82746283,1.2085135,0,0,0)"><tspan
   1.110 -             style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.111 -             sodipodi:role="line"
   1.112 +             style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.113 +             xml:space="preserve"><tspan
   1.114 +               id="tspan3521"
   1.115 +               y="220.9987"
   1.116 +               x="-689.91699"
   1.117 +               sodipodi:role="line"
   1.118 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1">Write Back</tspan></text>
   1.119 +          <rect
   1.120 +             style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
   1.121 +             id="rect3523"
   1.122 +             width="29.62429"
   1.123 +             height="58.297375"
   1.124 +             x="248.55019"
   1.125 +             y="541.71765" />
   1.126 +        </g>
   1.127 +        <g
   1.128 +           transform="translate(97.23188,-354.82812)"
   1.129 +           id="g3573">
   1.130 +          <text
   1.131 +             transform="matrix(0,-0.82746283,1.2085135,0,0,0)"
   1.132 +             sodipodi:linespacing="100%"
   1.133 +             id="text3575"
   1.134 +             y="220.9987"
   1.135 +             x="-689.91699"
   1.136 +             style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.137 +             xml:space="preserve"><tspan
   1.138 +               id="tspan3577"
   1.139 +               y="220.9987"
   1.140 +               x="-689.91699"
   1.141 +               sodipodi:role="line"
   1.142 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1">Reg</tspan></text>
   1.143 +          <rect
   1.144 +             style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
   1.145 +             id="rect3579"
   1.146 +             width="29.62429"
   1.147 +             height="58.297375"
   1.148 +             x="248.55019"
   1.149 +             y="541.71765" />
   1.150 +        </g>
   1.151 +        <g
   1.152 +           id="g3587"
   1.153 +           transform="translate(57.231876,-354.82812)">
   1.154 +          <text
   1.155 +             xml:space="preserve"
   1.156 +             style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.157               x="-689.91699"
   1.158               y="220.9987"
   1.159 -             id="tspan3521">Write Back</tspan></text>
   1.160 -        <rect
   1.161 -           y="541.71765"
   1.162 -           x="248.55019"
   1.163 -           height="58.297375"
   1.164 -           width="29.62429"
   1.165 -           id="rect3523"
   1.166 -           style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
   1.167 -      </g>
   1.168 -      <g
   1.169 -         id="g3573"
   1.170 -         transform="translate(97.23188,-354.82812)">
   1.171 -        <text
   1.172 -           xml:space="preserve"
   1.173 -           style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.174 -           x="-689.91699"
   1.175 -           y="220.9987"
   1.176 -           id="text3575"
   1.177 -           sodipodi:linespacing="100%"
   1.178 -           transform="matrix(0,-0.82746283,1.2085135,0,0,0)"><tspan
   1.179 -             style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.180 -             sodipodi:role="line"
   1.181 -             x="-689.91699"
   1.182 -             y="220.9987"
   1.183 -             id="tspan3577">Reg</tspan></text>
   1.184 -        <rect
   1.185 -           y="541.71765"
   1.186 -           x="248.55019"
   1.187 -           height="58.297375"
   1.188 -           width="29.62429"
   1.189 -           id="rect3579"
   1.190 -           style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
   1.191 -      </g>
   1.192 -      <g
   1.193 -         transform="translate(57.231876,-354.82812)"
   1.194 -         id="g3587">
   1.195 -        <text
   1.196 -           transform="matrix(0,-0.82746283,1.2085135,0,0,0)"
   1.197 -           sodipodi:linespacing="100%"
   1.198 -           id="text3589"
   1.199 -           y="220.9987"
   1.200 -           x="-689.91699"
   1.201 -           style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.202 -           xml:space="preserve"><tspan
   1.203 -             id="tspan3591"
   1.204 +             id="text3589"
   1.205 +             sodipodi:linespacing="100%"
   1.206 +             transform="matrix(0,-0.82746283,1.2085135,0,0,0)"><tspan
   1.207 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.208 +               sodipodi:role="line"
   1.209 +               x="-689.91699"
   1.210 +               y="220.9987"
   1.211 +               id="tspan3591">Decode</tspan></text>
   1.212 +          <rect
   1.213 +             y="541.71765"
   1.214 +             x="248.55019"
   1.215 +             height="58.297375"
   1.216 +             width="29.62429"
   1.217 +             id="rect3593"
   1.218 +             style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
   1.219 +        </g>
   1.220 +        <g
   1.221 +           transform="translate(-23.951556,-325.64468)"
   1.222 +           id="g3601">
   1.223 +          <text
   1.224 +             transform="matrix(0,-0.82746283,1.2085135,0,0,0)"
   1.225 +             sodipodi:linespacing="100%"
   1.226 +             id="text3603"
   1.227               y="220.9987"
   1.228               x="-689.91699"
   1.229 -             sodipodi:role="line"
   1.230 -             style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1">Decode</tspan></text>
   1.231 -        <rect
   1.232 -           style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
   1.233 -           id="rect3593"
   1.234 -           width="29.62429"
   1.235 -           height="58.297375"
   1.236 -           x="248.55019"
   1.237 -           y="541.71765" />
   1.238 -      </g>
   1.239 -      <g
   1.240 -         id="g3601"
   1.241 -         transform="translate(-23.951556,-325.64468)">
   1.242 -        <text
   1.243 -           xml:space="preserve"
   1.244 -           style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.245 -           x="-689.91699"
   1.246 -           y="220.9987"
   1.247 -           id="text3603"
   1.248 -           sodipodi:linespacing="100%"
   1.249 -           transform="matrix(0,-0.82746283,1.2085135,0,0,0)"><tspan
   1.250 -             style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.251 -             sodipodi:role="line"
   1.252 +             style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.253 +             xml:space="preserve"><tspan
   1.254 +               id="tspan3605"
   1.255 +               y="220.9987"
   1.256 +               x="-689.91699"
   1.257 +               sodipodi:role="line"
   1.258 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1">Fetch</tspan></text>
   1.259 +          <rect
   1.260 +             style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
   1.261 +             id="rect3607"
   1.262 +             width="29.62429"
   1.263 +             height="58.297375"
   1.264 +             x="248.55019"
   1.265 +             y="541.71765" />
   1.266 +        </g>
   1.267 +        <g
   1.268 +           transform="translate(-23.951556,-401.64468)"
   1.269 +           id="g3671">
   1.270 +          <text
   1.271 +             transform="matrix(0,-0.82746283,1.2085135,0,0,0)"
   1.272 +             sodipodi:linespacing="100%"
   1.273 +             id="text3673"
   1.274 +             y="216.03394"
   1.275 +             x="-680.24878"
   1.276 +             style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.277 +             xml:space="preserve"><tspan
   1.278 +               id="tspan3675"
   1.279 +               y="216.03394"
   1.280 +               x="-680.24878"
   1.281 +               sodipodi:role="line"
   1.282 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1">Firmware</tspan><tspan
   1.283 +               id="tspan3618"
   1.284 +               y="227.8649"
   1.285 +               x="-680.24878"
   1.286 +               sodipodi:role="line"
   1.287 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1">Fetch</tspan></text>
   1.288 +          <rect
   1.289 +             style="fill:none;stroke:#000000;stroke-width:1.80063593;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
   1.290 +             id="rect3677"
   1.291 +             width="29.62429"
   1.292 +             height="74.865425"
   1.293 +             x="248.55019"
   1.294 +             y="525.1496" />
   1.295 +        </g>
   1.296 +        <path
   1.297 +           sodipodi:nodetypes="cc"
   1.298 +           inkscape:connector-curvature="0"
   1.299 +           id="path3679"
   1.300 +           d="m 254.39121,186.08734 17.06968,0"
   1.301 +           style="fill:none;stroke:#000000;stroke-width:1.99999976;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none" />
   1.302 +        <path
   1.303 +           style="fill:none;stroke:#000000;stroke-width:1.99999976;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
   1.304 +           d="m 254.39121,246.08734 17.06968,0"
   1.305 +           id="path4765"
   1.306 +           inkscape:connector-curvature="0"
   1.307 +           sodipodi:nodetypes="cc" />
   1.308 +        <path
   1.309 +           sodipodi:nodetypes="cc"
   1.310 +           inkscape:connector-curvature="0"
   1.311 +           id="path4767"
   1.312 +           d="m 289.79949,216.08734 17.06968,0"
   1.313 +           style="fill:none;stroke:#000000;stroke-width:1.99999976;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none" />
   1.314 +        <path
   1.315 +           style="fill:none;stroke:#000000;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#marker12861)"
   1.316 +           d="m 279.2599,167.43416 0,-58.56486 -39.02321,0 0,13.01775"
   1.317 +           id="path4771"
   1.318 +           inkscape:connector-curvature="0"
   1.319 +           sodipodi:nodetypes="cccc" />
   1.320 +        <g
   1.321 +           transform="translate(137.23188,-414.82812)"
   1.322 +           id="g3145">
   1.323 +          <text
   1.324 +             transform="matrix(0,-0.82746283,1.2085135,0,0,0)"
   1.325 +             sodipodi:linespacing="100%"
   1.326 +             id="text3147"
   1.327 +             y="215.95836"
   1.328               x="-689.91699"
   1.329 -             y="220.9987"
   1.330 -             id="tspan3605">Fetch</tspan></text>
   1.331 -        <rect
   1.332 -           y="541.71765"
   1.333 -           x="248.55019"
   1.334 -           height="58.297375"
   1.335 -           width="29.62429"
   1.336 -           id="rect3607"
   1.337 -           style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
   1.338 -      </g>
   1.339 -      <g
   1.340 -         id="g3671"
   1.341 -         transform="translate(-23.951556,-401.64468)">
   1.342 -        <text
   1.343 -           xml:space="preserve"
   1.344 -           style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.345 -           x="-680.24878"
   1.346 -           y="216.03394"
   1.347 -           id="text3673"
   1.348 -           sodipodi:linespacing="100%"
   1.349 -           transform="matrix(0,-0.82746283,1.2085135,0,0,0)"><tspan
   1.350 -             style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.351 -             sodipodi:role="line"
   1.352 -             x="-680.24878"
   1.353 -             y="216.03394"
   1.354 -             id="tspan3675">Firmware</tspan><tspan
   1.355 -             style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.356 -             sodipodi:role="line"
   1.357 -             x="-680.24878"
   1.358 -             y="227.8649"
   1.359 -             id="tspan3618">Fetch</tspan></text>
   1.360 -        <rect
   1.361 -           y="525.1496"
   1.362 -           x="248.55019"
   1.363 -           height="74.865425"
   1.364 -           width="29.62429"
   1.365 -           id="rect3677"
   1.366 -           style="fill:none;stroke:#000000;stroke-width:1.80063593;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
   1.367 -      </g>
   1.368 -      <path
   1.369 -         style="fill:none;stroke:#000000;stroke-width:1.99999976;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
   1.370 -         d="m 254.39121,186.08734 17.06968,0"
   1.371 -         id="path3679"
   1.372 -         inkscape:connector-curvature="0"
   1.373 -         sodipodi:nodetypes="cc" />
   1.374 -      <path
   1.375 -         sodipodi:nodetypes="cc"
   1.376 -         inkscape:connector-curvature="0"
   1.377 -         id="path4765"
   1.378 -         d="m 254.39121,246.08734 17.06968,0"
   1.379 -         style="fill:none;stroke:#000000;stroke-width:1.99999976;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none" />
   1.380 -      <path
   1.381 -         style="fill:none;stroke:#000000;stroke-width:1.99999976;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
   1.382 -         d="m 289.79949,216.08734 17.06968,0"
   1.383 -         id="path4767"
   1.384 -         inkscape:connector-curvature="0"
   1.385 -         sodipodi:nodetypes="cc" />
   1.386 -      <path
   1.387 -         sodipodi:nodetypes="cccc"
   1.388 -         inkscape:connector-curvature="0"
   1.389 -         id="path4769"
   1.390 -         d="m 314.7929,186.08999 0,-17.15976 -35.50296,0 0,11.2426"
   1.391 -         style="fill:none;stroke:#000000;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#marker12861);stroke-miterlimit:4;stroke-dasharray:none" />
   1.392 -      <path
   1.393 -         sodipodi:nodetypes="cccc"
   1.394 -         inkscape:connector-curvature="0"
   1.395 -         id="path4771"
   1.396 -         d="m 314.79289,186.08999 0,-76.33136 -74.5562,0 0,13.01775"
   1.397 -         style="fill:none;stroke:#000000;stroke-width:1.5;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#marker12861);stroke-miterlimit:4;stroke-dasharray:none" />
   1.398 -      <g
   1.399 -         id="g3145"
   1.400 -         transform="translate(137.23188,-414.82812)">
   1.401 -        <text
   1.402 -           xml:space="preserve"
   1.403 -           style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.404 -           x="-689.91699"
   1.405 -           y="215.95836"
   1.406 -           id="text3147"
   1.407 -           sodipodi:linespacing="100%"
   1.408 -           transform="matrix(0,-0.82746283,1.2085135,0,0,0)"><tspan
   1.409 -             id="tspan3153"
   1.410 -             style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.411 -             sodipodi:role="line"
   1.412 +             style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.413 +             xml:space="preserve"><tspan
   1.414 +               y="215.95836"
   1.415 +               x="-689.91699"
   1.416 +               sodipodi:role="line"
   1.417 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.418 +               id="tspan3153">RT actor-Fn</tspan><tspan
   1.419 +               y="227.78932"
   1.420 +               x="-689.91699"
   1.421 +               sodipodi:role="line"
   1.422 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.423 +               id="tspan3157">accelerator</tspan></text>
   1.424 +          <rect
   1.425 +             style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
   1.426 +             id="rect3151"
   1.427 +             width="29.62429"
   1.428 +             height="58.297375"
   1.429 +             x="248.55019"
   1.430 +             y="541.71765" />
   1.431 +        </g>
   1.432 +        <g
   1.433 +           id="g3338"
   1.434 +           transform="translate(77.23188,-434.82812)">
   1.435 +          <text
   1.436 +             xml:space="preserve"
   1.437 +             style="font-size:13.14551258px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   1.438               x="-689.91699"
   1.439 -             y="215.95836">runtime</tspan><tspan
   1.440 -             id="tspan3157"
   1.441 -             style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.442 -             sodipodi:role="line"
   1.443 -             x="-689.91699"
   1.444 -             y="227.78932">accelerator</tspan></text>
   1.445 -        <rect
   1.446 -           y="541.71765"
   1.447 -           x="248.55019"
   1.448 -           height="58.297375"
   1.449 -           width="29.62429"
   1.450 -           id="rect3151"
   1.451 -           style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
   1.452 +             y="215.95836"
   1.453 +             id="text3340"
   1.454 +             sodipodi:linespacing="100%"
   1.455 +             transform="matrix(0,-0.82746283,1.2085135,0,0,0)"><tspan
   1.456 +               id="tspan3342"
   1.457 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.458 +               sodipodi:role="line"
   1.459 +               x="-689.91699"
   1.460 +               y="215.95836">context</tspan><tspan
   1.461 +               id="tspan3344"
   1.462 +               style="font-size:11.83096123px;text-align:center;text-anchor:middle;fill:#ff0000;fill-opacity:1"
   1.463 +               sodipodi:role="line"
   1.464 +               x="-689.91699"
   1.465 +               y="227.78932">manager</tspan></text>
   1.466 +          <rect
   1.467 +             y="541.71765"
   1.468 +             x="248.55019"
   1.469 +             height="58.297375"
   1.470 +             width="29.62429"
   1.471 +             id="rect3346"
   1.472 +             style="fill:none;stroke:#000000;stroke-width:1.80063581;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
   1.473 +        </g>
   1.474 +        <path
   1.475 +           sodipodi:nodetypes="ccc"
   1.476 +           inkscape:connector-curvature="0"
   1.477 +           id="path3406"
   1.478 +           d="m 326.46804,141.01142 -47.17809,0 0,39.16138"
   1.479 +           style="fill:none;stroke:#000000;stroke-width:1.50000012;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#marker12861)" />
   1.480 +        <path
   1.481 +           style="fill:none;stroke:#000000;stroke-width:1.2021842;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#Arrow2Mstart);marker-end:none"
   1.482 +           d="m 350.59917,185.51694 0,-18.50721"
   1.483 +           id="path3408"
   1.484 +           inkscape:connector-curvature="0"
   1.485 +           sodipodi:nodetypes="cc" />
   1.486 +        <path
   1.487 +           sodipodi:nodetypes="cc"
   1.488 +           inkscape:connector-curvature="0"
   1.489 +           id="path5008"
   1.490 +           d="m 356.67499,140.26334 27.68538,0"
   1.491 +           style="fill:none;stroke:#000000;stroke-width:1.2021842;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#Arrow2Mstart);marker-end:url(#marker12861)" />
   1.492 +        <path
   1.493 +           sodipodi:nodetypes="cc"
   1.494 +           inkscape:connector-curvature="0"
   1.495 +           id="path5084"
   1.496 +           d="m 330.59917,185.51694 0,-18.50721"
   1.497 +           style="fill:none;stroke:#000000;stroke-width:1.2021842;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:none;marker-end:url(#marker12861)" />
   1.498        </g>
   1.499      </g>
   1.500    </g>
     2.1 Binary file 0__Papers/Holistic_Model/Perf_Tune/figures/UCC_concreteness_grid.pdf has changed
     3.1 --- a/0__Papers/Holistic_Model/Perf_Tune/figures/UCC_concreteness_grid.svg	Tue Jun 05 07:28:32 2012 -0700
     3.2 +++ b/0__Papers/Holistic_Model/Perf_Tune/figures/UCC_concreteness_grid.svg	Tue Jun 19 16:18:34 2012 -0700
     3.3 @@ -344,9 +344,9 @@
     3.4       objecttolerance="10"
     3.5       inkscape:pageopacity="0.0"
     3.6       inkscape:pageshadow="2"
     3.7 -     inkscape:zoom="2.03"
     3.8 -     inkscape:cx="204.59135"
     3.9 -     inkscape:cy="568.20413"
    3.10 +     inkscape:zoom="3.31"
    3.11 +     inkscape:cx="127.74405"
    3.12 +     inkscape:cy="588.68751"
    3.13       inkscape:document-units="px"
    3.14       inkscape:current-layer="layer1"
    3.15       showgrid="false"
    3.16 @@ -409,45 +409,45 @@
    3.17         xml:space="preserve"
    3.18         style="font-size:9px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
    3.19         x="15.896555"
    3.20 -       y="529.44135"
    3.21 +       y="537.44135"
    3.22         id="text9856"
    3.23         sodipodi:linespacing="100%"><tspan
    3.24           style="font-size:8px;text-align:start;text-anchor:start"
    3.25           id="tspan9858"
    3.26           sodipodi:role="line"
    3.27           x="15.896555"
    3.28 -         y="529.44135">0 -- none: application code alone fully concretizes</tspan><tspan
    3.29 +         y="537.44135">0 -- none: application code alone fully concretizes</tspan><tspan
    3.30           style="font-size:8px;text-align:start;text-anchor:start"
    3.31           sodipodi:role="line"
    3.32           x="15.896555"
    3.33 -         y="537.44135"
    3.34 +         y="545.44135"
    3.35           id="tspan9860" /></text>
    3.36      <path
    3.37         inkscape:connector-curvature="0"
    3.38         sodipodi:nodetypes="cc"
    3.39         id="path3139"
    3.40 -       d="m 43.849847,481.79676 c 38.53453,0.002 67.009883,0.004 67.009883,0.004"
    3.41 -       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)" />
    3.42 +       d="m 74.933541,481.79676 c 43.366969,0.002 75.413299,0.004 75.413299,0.004"
    3.43 +       style="fill:none;stroke:#000000;stroke-width:1.06085134;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none" />
    3.44      <path
    3.45 -       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:url(#Arrow2Mend-8)"
    3.46 -       d="m 37.77295,475.70791 c 0.002,-38.53453 0.004,-67.00988 0.004,-67.00988"
    3.47 +       style="fill:none;stroke:#000000;stroke-width:1.05722809;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none"
    3.48 +       d="m 75.77295,482.61185 c 0.002,-43.07125 0.004,-74.89904 0.004,-74.89904"
    3.49         id="path3141"
    3.50         sodipodi:nodetypes="cc"
    3.51         inkscape:connector-curvature="0" />
    3.52      <text
    3.53         sodipodi:linespacing="100%"
    3.54         id="text3297"
    3.55 -       y="537.44135"
    3.56 +       y="545.44135"
    3.57         x="15.896555"
    3.58         style="font-size:9px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
    3.59         xml:space="preserve"><tspan
    3.60 -         y="537.44135"
    3.61 +         y="545.44135"
    3.62           x="15.896555"
    3.63           sodipodi:role="line"
    3.64           id="tspan3299"
    3.65           style="font-size:8px;text-align:start;text-anchor:start">1 -- parameters needed in addition to code</tspan><tspan
    3.66           id="tspan3301"
    3.67 -         y="545.44135"
    3.68 +         y="553.44135"
    3.69           x="15.896555"
    3.70           sodipodi:role="line"
    3.71           style="font-size:8px;text-align:start;text-anchor:start" /></text>
    3.72 @@ -455,272 +455,273 @@
    3.73         xml:space="preserve"
    3.74         style="font-size:9px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
    3.75         x="15.896555"
    3.76 -       y="545.44135"
    3.77 +       y="553.44135"
    3.78         id="text3303"
    3.79         sodipodi:linespacing="100%"><tspan
    3.80           style="font-size:8px;text-align:start;text-anchor:start"
    3.81           id="tspan3305"
    3.82           sodipodi:role="line"
    3.83           x="15.896555"
    3.84 -         y="545.44135">2 -- input data + (params) + code</tspan><tspan
    3.85 +         y="553.44135">2 -- input data + (params) + code</tspan><tspan
    3.86           style="font-size:8px;text-align:start;text-anchor:start"
    3.87           sodipodi:role="line"
    3.88           x="15.896555"
    3.89 -         y="553.44135"
    3.90 +         y="561.44135"
    3.91           id="tspan3307" /></text>
    3.92      <text
    3.93         sodipodi:linespacing="100%"
    3.94         id="text3309"
    3.95 -       y="553.44141"
    3.96 +       y="561.44141"
    3.97         x="15.896555"
    3.98         style="font-size:9px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
    3.99         xml:space="preserve"><tspan
   3.100 -         y="553.44141"
   3.101 +         y="561.44141"
   3.102           x="15.896555"
   3.103           sodipodi:role="line"
   3.104           id="tspan3311"
   3.105           style="font-size:8px;text-align:start;text-anchor:start">3 -- scheduling decisions + (data) + (params) + code</tspan><tspan
   3.106           id="tspan3313"
   3.107 -         y="561.44141"
   3.108 +         y="569.44141"
   3.109           x="15.896555"
   3.110           sodipodi:role="line"
   3.111           style="font-size:8px;text-align:start;text-anchor:start" /></text>
   3.112      <text
   3.113         xml:space="preserve"
   3.114         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.115 -       x="120.85421"
   3.116 -       y="485.97656"
   3.117 +       x="102.85421"
   3.118 +       y="497.97656"
   3.119         id="text3315"
   3.120         sodipodi:linespacing="100%"><tspan
   3.121           style="font-size:9px;text-align:start;text-anchor:start"
   3.122           id="tspan3317"
   3.123           sodipodi:role="line"
   3.124 -         x="120.85421"
   3.125 -         y="485.97656">Units</tspan><tspan
   3.126 +         x="102.85421"
   3.127 +         y="497.97656">Units</tspan><tspan
   3.128           style="font-size:9px;text-align:start;text-anchor:start"
   3.129           sodipodi:role="line"
   3.130 -         x="120.85421"
   3.131 -         y="494.97656"
   3.132 +         x="102.85421"
   3.133 +         y="506.97656"
   3.134           id="tspan3319" /></text>
   3.135      <text
   3.136         sodipodi:linespacing="100%"
   3.137         id="text3321"
   3.138 -       y="405.19916"
   3.139 -       x="16.290943"
   3.140 +       y="62.176929"
   3.141 +       x="-464.64716"
   3.142         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.143 -       xml:space="preserve"><tspan
   3.144 -         y="405.19916"
   3.145 -         x="16.290943"
   3.146 +       xml:space="preserve"
   3.147 +       transform="matrix(0,-1,1,0,0,0)"><tspan
   3.148 +         y="62.176929"
   3.149 +         x="-464.64716"
   3.150           sodipodi:role="line"
   3.151           id="tspan3323"
   3.152           style="font-size:9px;text-align:start;text-anchor:start">Constraints</tspan><tspan
   3.153           id="tspan3325"
   3.154 -         y="414.19916"
   3.155 -         x="16.290943"
   3.156 +         y="71.176926"
   3.157 +         x="-464.64716"
   3.158           sodipodi:role="line"
   3.159           style="font-size:9px;text-align:start;text-anchor:start" /></text>
   3.160      <text
   3.161         sodipodi:linespacing="100%"
   3.162         id="text3363"
   3.163         y="490.02182"
   3.164 -       x="41.620483"
   3.165 +       x="82.6548"
   3.166         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.167         xml:space="preserve"><tspan
   3.168           y="490.02182"
   3.169 -         x="41.620483"
   3.170 +         x="82.6548"
   3.171           sodipodi:role="line"
   3.172           id="tspan3365"
   3.173           style="font-size:9px;text-align:start;text-anchor:start">0</tspan><tspan
   3.174           id="tspan3367"
   3.175           y="499.02182"
   3.176 -         x="41.620483"
   3.177 +         x="82.6548"
   3.178           sodipodi:role="line"
   3.179           style="font-size:9px;text-align:start;text-anchor:start" /></text>
   3.180      <text
   3.181         xml:space="preserve"
   3.182         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.183 -       x="61.620483"
   3.184 +       x="101.54072"
   3.185         y="490.02182"
   3.186         id="text3369"
   3.187         sodipodi:linespacing="100%"><tspan
   3.188           style="font-size:9px;text-align:start;text-anchor:start"
   3.189           id="tspan3371"
   3.190           sodipodi:role="line"
   3.191 -         x="61.620483"
   3.192 +         x="101.54072"
   3.193           y="490.02182">1</tspan><tspan
   3.194           style="font-size:9px;text-align:start;text-anchor:start"
   3.195           sodipodi:role="line"
   3.196 -         x="61.620483"
   3.197 +         x="101.54072"
   3.198           y="499.02182"
   3.199           id="tspan3373" /></text>
   3.200      <text
   3.201         sodipodi:linespacing="100%"
   3.202         id="text3375"
   3.203         y="490.02182"
   3.204 -       x="81.620483"
   3.205 +       x="119.74769"
   3.206         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.207         xml:space="preserve"><tspan
   3.208           y="490.02182"
   3.209 -         x="81.620483"
   3.210 +         x="119.74769"
   3.211           sodipodi:role="line"
   3.212           id="tspan3377"
   3.213           style="font-size:9px;text-align:start;text-anchor:start">2</tspan><tspan
   3.214           id="tspan3379"
   3.215           y="499.02182"
   3.216 -         x="81.620483"
   3.217 +         x="119.74769"
   3.218           sodipodi:role="line"
   3.219           style="font-size:9px;text-align:start;text-anchor:start" /></text>
   3.220      <text
   3.221         xml:space="preserve"
   3.222         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.223 -       x="101.62048"
   3.224 +       x="138.19196"
   3.225         y="490.02182"
   3.226         id="text3381"
   3.227         sodipodi:linespacing="100%"><tspan
   3.228           style="font-size:9px;text-align:start;text-anchor:start"
   3.229           id="tspan3383"
   3.230           sodipodi:role="line"
   3.231 -         x="101.62048"
   3.232 +         x="138.19196"
   3.233           y="490.02182">3</tspan><tspan
   3.234           style="font-size:9px;text-align:start;text-anchor:start"
   3.235           sodipodi:role="line"
   3.236 -         x="101.62048"
   3.237 +         x="138.19196"
   3.238           y="499.02182"
   3.239           id="tspan3385" /></text>
   3.240      <text
   3.241         sodipodi:linespacing="100%"
   3.242         id="text3387"
   3.243 -       y="478.02182"
   3.244 -       x="29.620483"
   3.245 +       y="475.78091"
   3.246 +       x="67.620483"
   3.247         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.248         xml:space="preserve"><tspan
   3.249 -         y="478.02182"
   3.250 -         x="29.620483"
   3.251 +         y="475.78091"
   3.252 +         x="67.620483"
   3.253           sodipodi:role="line"
   3.254           id="tspan3389"
   3.255           style="font-size:9px;text-align:start;text-anchor:start">0</tspan><tspan
   3.256           id="tspan3391"
   3.257 -         y="487.02182"
   3.258 -         x="29.620483"
   3.259 +         y="484.78091"
   3.260 +         x="67.620483"
   3.261           sodipodi:role="line"
   3.262           style="font-size:9px;text-align:start;text-anchor:start" /></text>
   3.263      <text
   3.264         xml:space="preserve"
   3.265         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.266 -       x="29.620483"
   3.267 -       y="458.02182"
   3.268 +       x="67.620483"
   3.269 +       y="457.28091"
   3.270         id="text3393"
   3.271         sodipodi:linespacing="100%"><tspan
   3.272           style="font-size:9px;text-align:start;text-anchor:start"
   3.273           id="tspan3395"
   3.274           sodipodi:role="line"
   3.275 -         x="29.620483"
   3.276 -         y="458.02182">1</tspan><tspan
   3.277 +         x="67.620483"
   3.278 +         y="457.28091">1</tspan><tspan
   3.279           style="font-size:9px;text-align:start;text-anchor:start"
   3.280           sodipodi:role="line"
   3.281 -         x="29.620483"
   3.282 -         y="467.02182"
   3.283 +         x="67.620483"
   3.284 +         y="466.28091"
   3.285           id="tspan3397" /></text>
   3.286      <text
   3.287         sodipodi:linespacing="100%"
   3.288         id="text3399"
   3.289 -       y="438.02182"
   3.290 -       x="29.620483"
   3.291 +       y="438.82486"
   3.292 +       x="67.620483"
   3.293         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.294         xml:space="preserve"><tspan
   3.295 -         y="438.02182"
   3.296 -         x="29.620483"
   3.297 +         y="438.82486"
   3.298 +         x="67.620483"
   3.299           sodipodi:role="line"
   3.300           id="tspan3401"
   3.301           style="font-size:9px;text-align:start;text-anchor:start">2</tspan><tspan
   3.302           id="tspan3403"
   3.303 -         y="447.02182"
   3.304 -         x="29.620483"
   3.305 +         y="447.82486"
   3.306 +         x="67.620483"
   3.307           sodipodi:role="line"
   3.308           style="font-size:9px;text-align:start;text-anchor:start" /></text>
   3.309      <text
   3.310         xml:space="preserve"
   3.311         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.312 -       x="29.620483"
   3.313 -       y="418.02182"
   3.314 +       x="67.620483"
   3.315 +       y="420.28091"
   3.316         id="text3405"
   3.317         sodipodi:linespacing="100%"><tspan
   3.318           style="font-size:9px;text-align:start;text-anchor:start"
   3.319           id="tspan3407"
   3.320           sodipodi:role="line"
   3.321 -         x="29.620483"
   3.322 -         y="418.02182">3</tspan><tspan
   3.323 +         x="67.620483"
   3.324 +         y="420.28091">3</tspan><tspan
   3.325           style="font-size:9px;text-align:start;text-anchor:start"
   3.326           sodipodi:role="line"
   3.327 -         x="29.620483"
   3.328 -         y="427.02182"
   3.329 +         x="67.620483"
   3.330 +         y="429.28091"
   3.331           id="tspan3409" /></text>
   3.332      <text
   3.333         xml:space="preserve"
   3.334         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.335 -       x="41.620483"
   3.336 -       y="476.02182"
   3.337 +       x="79.620483"
   3.338 +       y="475.03659"
   3.339         id="text3411"
   3.340         sodipodi:linespacing="100%"><tspan
   3.341           style="font-size:9px;text-align:start;text-anchor:start"
   3.342           id="tspan3413"
   3.343           sodipodi:role="line"
   3.344 -         x="41.620483"
   3.345 -         y="476.02182">A</tspan><tspan
   3.346 +         x="79.620483"
   3.347 +         y="475.03659">A</tspan><tspan
   3.348           style="font-size:9px;text-align:start;text-anchor:start"
   3.349           sodipodi:role="line"
   3.350 -         x="41.620483"
   3.351 -         y="485.02182"
   3.352 +         x="79.620483"
   3.353 +         y="484.03659"
   3.354           id="tspan3415" /></text>
   3.355      <text
   3.356         sodipodi:linespacing="100%"
   3.357         id="text3417"
   3.358 -       y="438.02182"
   3.359 -       x="41.620483"
   3.360 +       y="437.03659"
   3.361 +       x="79.620483"
   3.362         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.363         xml:space="preserve"><tspan
   3.364 -         y="438.02182"
   3.365 -         x="41.620483"
   3.366 +         y="437.03659"
   3.367 +         x="79.620483"
   3.368           sodipodi:role="line"
   3.369           id="tspan3419"
   3.370           style="font-size:9px;text-align:start;text-anchor:start">C</tspan><tspan
   3.371           id="tspan3421"
   3.372 -         y="447.02182"
   3.373 -         x="41.620483"
   3.374 +         y="446.03659"
   3.375 +         x="79.620483"
   3.376           sodipodi:role="line"
   3.377           style="font-size:9px;text-align:start;text-anchor:start" /></text>
   3.378      <text
   3.379         xml:space="preserve"
   3.380         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.381 -       x="101.62048"
   3.382 -       y="420.02182"
   3.383 +       x="139.62048"
   3.384 +       y="419.03659"
   3.385         id="text3423"
   3.386         sodipodi:linespacing="100%"><tspan
   3.387           style="font-size:9px;text-align:start;text-anchor:start"
   3.388           id="tspan3425"
   3.389           sodipodi:role="line"
   3.390 -         x="101.62048"
   3.391 -         y="420.02182">D</tspan><tspan
   3.392 +         x="139.62048"
   3.393 +         y="419.03659">D</tspan><tspan
   3.394           style="font-size:9px;text-align:start;text-anchor:start"
   3.395           sodipodi:role="line"
   3.396 -         x="101.62048"
   3.397 -         y="429.02182"
   3.398 +         x="139.62048"
   3.399 +         y="428.03659"
   3.400           id="tspan3427" /></text>
   3.401      <text
   3.402         sodipodi:linespacing="100%"
   3.403         id="text3429"
   3.404 -       y="458.02182"
   3.405 -       x="81.620483"
   3.406 +       y="457.03659"
   3.407 +       x="119.62048"
   3.408         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.409         xml:space="preserve"><tspan
   3.410 -         y="458.02182"
   3.411 -         x="81.620483"
   3.412 +         y="457.03659"
   3.413 +         x="119.62048"
   3.414           sodipodi:role="line"
   3.415           id="tspan3431"
   3.416           style="font-size:9px;text-align:start;text-anchor:start">B</tspan><tspan
   3.417           id="tspan3433"
   3.418 -         y="467.02182"
   3.419 -         x="81.620483"
   3.420 +         y="466.03659"
   3.421 +         x="119.62048"
   3.422           sodipodi:role="line"
   3.423           style="font-size:9px;text-align:start;text-anchor:start" /></text>
   3.424      <text
   3.425 @@ -734,33 +735,86 @@
   3.426           x="15.896555"
   3.427           sodipodi:role="line"
   3.428           id="tspan3437"
   3.429 -         style="font-size:9px;text-align:start;text-anchor:start">Numbers indicate which factors have to be</tspan><tspan
   3.430 +         style="font-size:9px;text-align:start;text-anchor:start">Numbers indicate which information has to be</tspan><tspan
   3.431           y="516.44135"
   3.432           x="15.896555"
   3.433           sodipodi:role="line"
   3.434           style="font-size:9px;text-align:start;text-anchor:start"
   3.435 -         id="tspan3447">added in order to move the UCC to concrete:</tspan><tspan
   3.436 +         id="tspan3447">added for that dimension of UCC in order to</tspan><tspan
   3.437 +         y="525.44135"
   3.438 +         x="15.896555"
   3.439 +         sodipodi:role="line"
   3.440 +         style="font-size:9px;text-align:start;text-anchor:start"
   3.441 +         id="tspan4763">make it fully concrete:</tspan><tspan
   3.442           id="tspan3439"
   3.443 -         y="525.44135"
   3.444 +         y="534.44135"
   3.445           x="15.896555"
   3.446           sodipodi:role="line"
   3.447           style="font-size:9px;text-align:start;text-anchor:start" /></text>
   3.448      <text
   3.449         xml:space="preserve"
   3.450         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:start;line-height:100%;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Trebuchet MS;-inkscape-font-specification:Trebuchet MS"
   3.451 -       x="103.12312"
   3.452 +       x="111.12312"
   3.453         y="385.44135"
   3.454         id="text3441"
   3.455         sodipodi:linespacing="100%"><tspan
   3.456           style="font-size:10px;text-align:center;text-anchor:middle"
   3.457           sodipodi:role="line"
   3.458 -         x="103.12312"
   3.459 +         x="111.12312"
   3.460           y="385.44135"
   3.461 -         id="tspan3445">Factors Needed by Given UCC in order to</tspan><tspan
   3.462 +         id="tspan3445">Information Needed for Given UCC</tspan><tspan
   3.463           style="font-size:10px;text-align:center;text-anchor:middle"
   3.464           sodipodi:role="line"
   3.465 -         x="103.12312"
   3.466 +         x="111.12312"
   3.467           y="395.44135"
   3.468 -         id="tspan3459">Make it Fully Concrete</tspan></text>
   3.469 +         id="tspan3459">in Order to Make it Fully Concrete</tspan></text>
   3.470 +    <path
   3.471 +       inkscape:connector-curvature="0"
   3.472 +       sodipodi:nodetypes="cc"
   3.473 +       id="path4743"
   3.474 +       d="m 94.265561,480.64141 c 0.002,-41.93813 0.004,-72.9286 0.004,-72.9286"
   3.475 +       style="fill:none;stroke:#000000;stroke-width:1.04322863;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none" />
   3.476 +    <path
   3.477 +       style="fill:none;stroke:#000000;stroke-width:1.04322863;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none"
   3.478 +       d="m 112.75817,480.64141 c 0.002,-41.93813 0.004,-72.9286 0.004,-72.9286"
   3.479 +       id="path4745"
   3.480 +       sodipodi:nodetypes="cc"
   3.481 +       inkscape:connector-curvature="0" />
   3.482 +    <path
   3.483 +       inkscape:connector-curvature="0"
   3.484 +       sodipodi:nodetypes="cc"
   3.485 +       id="path4747"
   3.486 +       d="m 149.74339,480.64141 c 0.002,-41.93813 0.004,-72.9286 0.004,-72.9286"
   3.487 +       style="fill:none;stroke:#000000;stroke-width:1.04322863;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none" />
   3.488 +    <path
   3.489 +       style="fill:none;stroke:#000000;stroke-width:1.04322863;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none"
   3.490 +       d="m 131.25078,480.64141 c 0.002,-41.93813 0.004,-72.9286 0.004,-72.9286"
   3.491 +       id="path4749"
   3.492 +       sodipodi:nodetypes="cc"
   3.493 +       inkscape:connector-curvature="0" />
   3.494 +    <path
   3.495 +       style="fill:none;stroke:#000000;stroke-width:1.06085134;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none"
   3.496 +       d="m 74.933541,463.29676 c 43.366969,0.002 75.413299,0.004 75.413299,0.004"
   3.497 +       id="path4751"
   3.498 +       sodipodi:nodetypes="cc"
   3.499 +       inkscape:connector-curvature="0" />
   3.500 +    <path
   3.501 +       inkscape:connector-curvature="0"
   3.502 +       sodipodi:nodetypes="cc"
   3.503 +       id="path4753"
   3.504 +       d="m 74.933541,444.79676 c 43.366969,0.002 75.413299,0.004 75.413299,0.004"
   3.505 +       style="fill:none;stroke:#000000;stroke-width:1.06085134;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none" />
   3.506 +    <path
   3.507 +       style="fill:none;stroke:#000000;stroke-width:1.06085134;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none"
   3.508 +       d="m 74.933541,426.29676 c 43.366969,0.002 75.413299,0.004 75.413299,0.004"
   3.509 +       id="path4755"
   3.510 +       sodipodi:nodetypes="cc"
   3.511 +       inkscape:connector-curvature="0" />
   3.512 +    <path
   3.513 +       inkscape:connector-curvature="0"
   3.514 +       sodipodi:nodetypes="cc"
   3.515 +       id="path4757"
   3.516 +       d="m 74.933541,407.79676 c 43.366969,0.002 75.413299,0.004 75.413299,0.004"
   3.517 +       style="fill:none;stroke:#000000;stroke-width:1.06085134;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;marker-end:none" />
   3.518    </g>
   3.519  </svg>
     4.1 --- a/0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex	Tue Jun 05 07:28:32 2012 -0700
     4.2 +++ b/0__Papers/Holistic_Model/Perf_Tune/latex/Holistic_Perf_Tuning.tex	Tue Jun 19 16:18:34 2012 -0700
     4.3 @@ -293,7 +293,7 @@
     4.4          TU Berlin
     4.5  \and
     4.6          Sean Halle\\
     4.7 -        TU Berlin
     4.8 +        Open Source Research Institute
     4.9  \and
    4.10          Ben Juurlink\\
    4.11          TU Berlin
    4.12 @@ -311,12 +311,15 @@
    4.13  
    4.14  We describe an approach that relies upon a new model of parallel computation to link performance loss to its cause, whether that be in the hardware, runtime, or application code. The visualizations produced clearly identify idle cores, and tie the idleness to causal interactions within the runtime and hardware, and from there to the parallelism constructs that constrained the runtime and hardware behavior.
    4.15   
    4.16 -
    4.17 -
    4.18  This is implemented for multi-core hardware, and we walk through a tuning session on a large multi-core machine to illustrate how performance loss is identified and how hypotheses for the cause are generated. We also give a concise description of the implementation and the computation model. \end{abstract}
    4.19  
    4.20  
    4.21 +
    4.22 +
    4.23 +
    4.24 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    4.25  \section{Introduction and Motivation}
    4.26 +\label{sec:intro}
    4.27  
    4.28  Performance visualizations and tuning tools for parallel programs are critical to achieving good performance, and yet current solutions leave something to be desired. All too often, performance tuning consists of staring at abstract views of usage or statistics, trying to \emph{guess} the cause of performance loss. Despite many different views of performance measurements, such as frequency by line of code, message sends and receives, and core usage timelines, the user doesn't know why a function runs in a particular spot on a particular core, nor whether that is desired behavior or erroneous behavior.
    4.29  
    4.30 @@ -337,63 +340,128 @@
    4.31  In the following, we will describe our model of computation, which drives the instrumentation and visualization, and links the aspects of performance to the collected information.
    4.32  The usage of the model and visualization is illustrated with a story line of  performance tuning a standard parallel application on a large multi-core system.
    4.33  
    4.34 -We start with background on performance tuning and an overview of previous approaches in section \ref{related}. We shall introduce our model through a case study in section \ref{casestudy}, and then expand on the theory behind it in section \ref{theory}. Section \ref{sec:Implementation} will tie the model to implementation details. Finally, we will conclude in section \ref{conclusion}.
    4.35 +We start with background on performance tuning and an overview of previous approaches in section \ref{sec:related}. We shall introduce our model through a case study in section \ref{sec:casestudy}, and then expand on the theory behind it in section \ref{sec:theory}. Section \ref{sec:Implementation} will tie the model to implementation details. Finally, we will conclude in section \ref{sec:conclusion}.
    4.36  
    4.37  
    4.38 +
    4.39 +
    4.40 +
    4.41 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    4.42  \section{Background and Related Work}
    4.43 -\label{related}
    4.44 +\label{sec:related}
    4.45  
    4.46 -Performance tuning is an iterative process. The programmer usually has some expectation of the performance of her program, and takes measurements during execution that are then compared to the desired outcome. A mental model, constructed through experience and knowledge of the mechanics of execution, is then used to generate a hypothesis explaining any discrepancies between the measurement and expectations. This hypothesis is then linked, again through a mental model, to things within the programmer's control, to suggest a change to make to the code. The modified code is then run again, and these steps are repeated until the programmer is satisfied with the performance of the program.
    4.47 +A quick review of the process of performance tuning will provide much needed context for the shortcomings of other tools.
    4.48  
    4.49 -To understand the performance of parallel programs, more than just runtime measurements are important. Context for those measurements must also be gathered, which includes the characteristics of the units of work that are scheduled onto resources, as well as the constraints on scheduling those. It is also crucial to record during the run not only measurements of when the hardware was busy doing what, but also the scheduling decisions that affected this hardware usage.
    4.50 + Performance tuning is an iterative process that involves a mental model. The programmer usually has some expectation of the performance of her program, and takes measurements during execution that are then compared to the desired outcome. A mental model, constructed through experience and knowledge of the mechanics of execution, is then used to generate a hypothesis explaining any discrepancies between the measurement and expectations. This hypothesis is then linked, again through a mental model, to things within the programmer's control, to suggest a change to make to the code. The modified code is then run again, and these steps are repeated until the programmer is satisfied with the performance of the program.
    4.51  
    4.52 -Based on this pattern of performance tuning, we evaluate the advantages and shortcomings of different tools' representations of an application and its performance. The first four approaches are found, sometimes in combinations, in most classic performance evaluation tools, while the fifth is starting to emerge, enabled by the growing adoption of task-based programming languages in recent years.
    4.53 +For parallel programs, context must be gathered that relates the measurements to the mental model. This context includes identification of the units of work that are scheduled onto resources, as well as the constraints on scheduling those units. It is also crucial to record during the run not only measurements of when the hardware was busy doing what, but also the scheduling decisions that affected this hardware usage.
    4.54  
    4.55 -%%
    4.56 +With this in mind, we evaluate the advantages and shortcomings of five categories of models used for performance tuning. The first four approaches are found, sometimes in combinations, in most classic performance evaluation tools, while the fifth is starting to emerge, enabled by the growing adoption of task-based programming languages in recent years.
    4.57 +
    4.58 +The commonality among the four classic approaches is either the lack of a  model of parallel computation, or a poorly suited one.  
    4.59 +
    4.60 +The early tools also suffered from applications that effectively implemented the runtime as part of the application code. Both MPI and threads effectively force implementing a runtime system in the application. In such a case, the units of work are implied in the code, and difficult for tools to recognize. Likewise, constraints on scheduling are enforced by the code, but never stated in any explicit form. 
    4.61 +
    4.62  
    4.63  \subsection{Thread-model based Approaches}
    4.64 -Most of the older more established tools come from the threads world, and conceive of the application as a processor that performs actions, but don't include the concept of application-defined tasks nor constraints on them. This makes them unable to directly connect statistics they gather to application features.  The lack of connection forces the user to guess at what aspect of the code is responsible for observed performance.
    4.65 +Most of the older more established tools come from the threads world, and conceive of the application as a collection of virtual processors that perform actions, but don't include the concept of application-defined tasks nor constraints on them. This makes them unable to directly connect statistics they gather to specific application features.  The lack of connection forces the user to guess at what aspect of the code is responsible for observed performance.
    4.66  
    4.67 +For example, Tau [] is representative of the thread-centric approach, and is a highly cited, older system for performance tuning parallel applications. It integrates many data sources, and has rich displays. However, it models cores, memories, and thread contexts, with actions taken on or by each, and has no well-defined concept of a unit of work. What it is missing are the concepts of tasks, constraints on them, and scheduling choices.
    4.68  
    4.69 +\subsection{Event-centric approach}
    4.70  
    4.71 +Systems that model parallel computation as a collection of events are well represented by Paradyn[], another highly cited classic performance tuning tool.  Its model of computation is based on events, both the timing of events and counts of events.
    4.72  
    4.73 -For example, Tau is a highly cited older system for performance tuning parallel applications, which is representative of thread-centric approaches. It integrates many data sources, and has rich displays. However its model was cores and memories and thread contexts, with actions taken on or by each.
    4.74 + It has a system for user-supplied instrumentation to collect event information and it has a hypotheses mechanism that protects the user from having to write custom code to test their hypotheses. However, the hypotheses are in terms of the timing and counts of events, not the parallel computation relevant information of units of scheduled work and the scheduling decisions made on those. 
    4.75  
    4.76 - It had no well defined concept of unit of work, instead its concept was ``which virtual processor (thread) is running?" It had the concept of scheduling such virtual processors, but this scheduling is disconnected from the application, leaving the user with little or no understanding of what to do to their code to get better behavior. What it is missing is the concept:  tasks, constraints on them, and scheduling choices.
    4.77 +\subsection{Message-centric approach}
    4.78  
    4.79 -\subsection{Application-intensive approach}
    4.80 -
    4.81 -Another highly cited classic performance tuning system is Paradyn[], which is representative of systems that model parallel computation as a collection of events. It is meant for applications that run for several days on multi-thousand node clusters. Its model of computation is based on events, both the timing of events and counts of events. It has a system for user-supplied instrumentation to collect event information and it has a hypotheses mechanism that protects the user from having to write custom code to test their hypotheses. However, the hypotheses are in terms of the timing and counts of events, not the parallel computation relevant information of units of scheduled work and the scheduling decisions made on those. 
    4.82 -
    4.83 -\subsection{Event driven approach}
    4.84 -
    4.85 -Paragraph also follows an event-based model, and represents the large collection of simpler tools that instrument the MPI library. It shows whether cores are busy, and indicates communication overhead, but lacks any features that tie the communication pattern realized to application code features, which are what is under programmer control. It also fails to show runtime overhead, and which portions of idle time are caused by runtime internal constraints.
    4.86 +Paragraph also follows an event-based model, but represents the large collection of simpler tools that instrument the MPI or other message-passing library. It shows whether cores are busy, and indicates communication overhead, but lacks an underlying computation model to tie the communication pattern realized to application code features, which are what is under programmer control. It also fails to show runtime overhead, and which portions of idle time are caused by runtime internal constraints.
    4.87  
    4.88  Paraver and Vampir are just painting tools that take event measurements and paint them on the screen.
    4.89  
    4.90  \subsection{Performance-counter approaches}
    4.91 -Other approaches, such as VTune, POPI, and so forth concentrate on performance counter data to identify hot-spots and potential false-sharing.  These suffer from the same lack of encompassing computation model, leaving the user to guess at what might be the cause of measured numbers.  They do a good job of saying that something might be wrong, but a poor job of pointing to what is causing the problem, and hence leave the user baffled as to what to change in their code to get better performance.
    4.92 +Performance-counter approaches, such as VTune, POPI, and so forth, concentrate on identifying hot-spots and potential false-sharing.  These suffer from a lack of an encompassing computation model, leaving the user to guess at what might be the cause of measured numbers.  They do a good job of saying that something might be wrong, but a poor job of pointing to what is causing the problem, and hence leave the user baffled as to what to change in their code to get better performance.
    4.93  
    4.94  \subsection{Newer approaches}
    4.95 -The commonality among the classic approaches is the lack of a  model of parallel computation. 
    4.96 + 
    4.97  
    4.98 -One difficulty faced by early tools is that  parallel applications written in MPI or threads effectively end up implementing a runtime system in the application code. In such a case, the units of work are implied in the code, and difficult for tools to recognize. Likewise, constraints on scheduling are enforced by the code, but never stated in any explicit form. 
    4.99 + These shortcomings are rectified by recent parallel languages such as CnC[] and StarSs[], which cleanly identify tasks and the constraints on them. The people who develop such a language also develop  tools to go with it. 
   4.100  
   4.101 - These shortcomings are rectified by recent parallel languages such as CnC and StarSs, which cleanly identify tasks and the constraints on them. The people who develop such a language also develop language-specific tools to go with it. 
   4.102 -
   4.103 -The StarSs tool tries to simplify the view for the user. It doesn't give performance information directly, but instead identifies tasks and tells the user whether it thinks the task size is too small, just right, or    tool big. Too small has too much runtime overhead, while too big has too few tasks to keep the cores busy. It makes task one color if too short, another if just right, and a third if too long.
   4.104 +The StarSs tool[]\ tries to simplify the view for the user. It doesn't give performance information directly, but instead identifies tasks and tells the user whether it thinks the task size is too small, just right, or too big. Tasks that are too small incur too much runtime overhead, while tasks that are too big leave too few tasks to keep the cores busy.
   4.105  
   4.106  This is a step in the right direction, but it doesn't include effects of code choices on communication, and gives the programmer only limited information. Subtle and complex interactions between code, runtime, and hardware are often to blame for performance loss, and the limited information supplied here is no help with such complexities.
   4.107  
   4.108  
   4.109 +
   4.110 +
   4.111 +
   4.112 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   4.113  \section{Illustrative Story of Performance Tuning}
   4.114 -\label{casestudy}
   4.115 +\label{sec:casestudy}
   4.116 +
   4.117 +In this section, we illustrate the benefits of connecting measurements to an appropriate computation model by walking through a typical performance tuning session. It shows the features of our approach in action, and indicates how competing tools' lack of those features makes the work more difficult.
   4.118 +
   4.119 +We first describe the program and language used, and then the features of our visualization. After this preparation, we show a sequence of the visualizations. In each,  we point out how the  performance loss is identified, and which visual features suggest the hypothesis for the cause of the loss.  
   4.120 +
   4.121 +\subsection{The Application, and Target Hardware}
   4.122 +
   4.123 +In our session, we wish to tune a standard program that the reader has likely already experienced attempting to performance tune, and/or knows well. The best example is likely matrix multiply, with which the reader should be familiar, allowing concentration on the tool without distraction about the application. 
   4.124 +
   4.125 +We run it on a machine with 4 sockets by 10 cores each, for a total of 40 physical cores. They are Intel WestmereEx cores running at 3.0GHz, with TurboBoost turned off. 
   4.126 +
   4.127 +The application code uses language features to create virtual processors (VP). The first VP created divides the work into a number of pieces and creates a new VP for each piece. 
   4.128 +
   4.129 +The number of pieces is determined by a tuning parameter in the code, together with the number of cores. It uses language features to distribute the VPs across the cores, initially in a round-robin fashion.
   4.130 +
   4.131 +It then creates   a results VP that receives a partial-result from each piece and accumulates the results. The original divider VP  then waits for the results VP to indicate completion, after which the language  shuts down.
   4.132 +
   4.133 +The language used is SSR, which is based on rendez-vous style send and receive operations made between virtual processors (VPs). It has commands for creating and destroying VPs, and three kinds of send-receive paired operations. 
   4.134 +
   4.135 +The first, \emph{send\_from\_to} specifies a specific sender and specific receiver. It is used by the results VP to tell the divider VP that the work is complete. The second, \emph{send\_of\_type\_to}, specifies a specific receiver, but the sender is anonymous, which increases flexibility while maintaining some control over scope. This  is used by the VPs doing the pieces to send their partial-result to the results processor. The third kind, \emph{send\_of\_type}, only specifies the type, and so acts as a global communication channel; this is not used in our application.
   4.136 +
   4.137 + The language also includes a \emph{singleton} construct that designates a piece of code as to be executed only once, which we use to  rearrange and copy data to get better cache behavior. A given copy is shared by several virtual processors on different cores, but the copy only needs to be performed once. 
   4.138 +
   4.139 +Miscellaneous performance constructs are also available, such as one to force which core a virtual processor is assigned to. We use this in our example program to control scheduling.
   4.140 +
   4.141 +A note on terminology: the word ``task''  has acquired multiple  meanings in the literature, making it a vague term. We often use, instead, the term  ``work-unit'', which is defined precisely as the trace-segment performed on a core, between two successive scheduling events, plus the set of datums consumed by that trace segment. The word task often maps well onto this precise definition, and we use both words, but mean the precise work-unit definition when we say task.
   4.142 +
   4.143 +
   4.144 +
   4.145 +\subsection{The Visualizations}
   4.146 +\label{subsec:visualization_def}
   4.147 + The first visualization is what we refer to as a scheduling consequence graph (SCG), or just consequence graph (CG). It depicts the scheduling operations performed by the runtime, and the consequent usage of the cores. 
   4.148 +
   4.149 +The second visualization depicts constraints on those scheduling decisions that come from the application, such as dependencies in the code. These limit the choices the runtime is allowed to make. We call this the Unit \& Constraint Collection, or UCC. 
   4.150 +
   4.151 +The UCC shows only application-derived information, as opposed to the consequence graph, which combines the \textit{use} of the UCC-depicted constraints with runtime-imposed dependencies and hardware-imposed constraints.  Hence, the UCC states the degrees of freedom enabled by the application, while the consequence graph states how those were made use of, by a particular runtime on particular hardware.
   4.152 +
   4.153 +Fig X shows a consequence graph (CG), stylized for purposes of explanation. It is composed of a number of columns, one for each core. A column represents time,  with early at the top, increasing as one goes down, measured in clock cycles. It is broken into blocks, each representing the time accounted to one work-unit. Each block is further divided into regions, each a different color which indicates the kind of activity the core was engaged in during that  region's time-span.  
   4.154 +  
   4.155 +The kinds of activities are defined by the computation model that underlies the visualization. The first is the work of a work-unit, represented by a blue-to-red region. It includes time stalled due to cache misses. The color indicates  intensity of cache misses, with pure red representing at or above the maximum misses per instruction, and pure blue the minimum.  The max and min are set in the tool that generates the visualization.
   4.156 +
   4.157 +  The second kind of activity is runtime overhead, represented by a gray region. This is the overhead spent on that particular work-unit. When desired by the viewer, it is further broken into pieces representing activities inside the runtime. These may include acquisition of a lock on shared semantic state, time spent on constraints determining readiness of the work-unit, on deciding which ready one to assign to which hardware, and time spent switching from virtual processor, to the runtime, and back. In this paper, we show all runtime overhead the same way, however in other circumstances a breakdown can be key to understanding interaction between runtime and application. 
   4.158 + 
   4.159 +
   4.160 +The other type of visual feature seen in Fig X is lines. Each represents a construct that influenced scheduling.   The line depicts  two things: a constraint and a decision, both inside the runtime. The constraint was satisfied, which made the   decision possible, choosing which core to do the work on. 
   4.161 +
   4.162 +In general, other kinds of lines may also be drawn, representing other kinds of interactions that affect core usage. For example, visualization can be turned on for the internal runtime constraint that only one core at a time may access shared constraint and scheduling state. This appears as additional lines linking the gray runtime regions of blocks. In this paper, visualization is turned off for such minor interactions. 
   4.163 +
   4.164 +Two work-unit (blue-to-red) blocks that appear in sequence and have no parallelism-construct constraint between them often have a causal dependency, due to the semantics of the base language. Visualization of these dependencies is turned off, but they can be checked via the link to the code.
   4.165 + 
   4.166 +Many different orderings could also have been validly chosen. Which scheduler choices are valid is determined by three kinds of constraints: the application code constraints, hardware constraints, and runtime implementation imposed constraints. 
   4.167 +
   4.168 +Returning to Fig \ref{story}, the lines in red, orange, and green represent application-code constraints that each tie two work-units together. The color represents the parallelism construct that imposed that kind of constraint. Red is creation of a virtual processor, green is the many-to-one \texttt{send\_of\_type\_to}, and orange is the singleton construct. For better visibility, only constraints that cross cores are shown.
   4.169 +
   4.170 +The columns represent each core, and the vertical axis represents the progress of the program execution, starting from the top. This gives at a glance the total runtime (height), level of parallelism during the run (number of filled columns at a specific height), and processor utilization (blue-covered area vs. white). All figures are at the same scale, so they can be compared directly.
   4.171 +
   4.172 +
   4.173  
   4.174  \begin{figure*}[t!]
   4.175    \begin{minipage}[b]{0.2\textwidth}
   4.176          \subfloat[Original]
   4.177      {\quad\quad \includegraphics[scale=0.015]{../figures/192.pdf} \quad
   4.178 -    \label{story:a}}\quad
   4.179 +    \label{fig:story:a}}\quad
   4.180    \end{minipage}
   4.181    \begin{minipage}[b]{0.7\textwidth}
   4.182    % \subfloat[]
   4.183 @@ -437,66 +505,15 @@
   4.184  \label{story}
   4.185  \end{figure*}
   4.186  
   4.187 -In this section, we show our approach being used during a typical performance tuning session, to see how its features give benefit, and how competing tools' lack of those features makes the work more difficult.
   4.188 +\subsection{Walk-through}
   4.189 +\label{subsec:walk-through}
   4.190  
   4.191 -To prepare, we describe the program and language used, and then the details of a generated visualization. After this we show a sequence of the visualizations. In each,  we point out how the  performance loss is identified, and which visual features suggest the hypothesis for the cause of the loss.  
   4.192  
   4.193 -\subsection{The Application, and Machine Run On}
   4.194  
   4.195 -In our session, we wish to tune a standard program that the reader has likely already experienced attempting to performance tune, and/or knows well. The best example is likely matrix multiply, with which the reader should be familiar, allowing concentration on the tool without distraction about the application. 
   4.196 -
   4.197 -We run it on a machine with 4 sockets by 10 cores each, for a total of 40 physical cores. They are Intel WestmereEx cores running at 3.0GHz, with TurboBoost turned off. 
   4.198 -
   4.199 -The application code uses language features to create a number of virtual processors (VP). The first VP created runs a set of functions that divide the work into a number of pieces and creates a new VP for each piece. 
   4.200 -
   4.201 -How many pieces is set by a tuning parameter in the code, and the number of cores. It uses language features to distribute the VPs  across the cores, initially in a round-robin fashion.
   4.202 -
   4.203 -It then creates   a results VP that receives a partial-result from each piece and accumulates them. The original divider VP  then waits for the results VP to indicate completion, after which the language  shuts down.
   4.204 -
   4.205 -The language used is SSR, which is based on rendez-vous style send and receive operations made between virtual processors (VPs). It has commands for creating and destroying VPs, and three kinds of send-receive paired operations. 
   4.206 -
   4.207 -The first, \emph{send\_from\_to} specifies a specific sender and specific receiver. It is used by the results VP to tell the divider VP that the work is complete. The second, \emph{send\_of\_type\_to}, specifies a specific receiver, but the sender is anonymous, which increases flexibility while maintaining some control over scope. This  is used by the VPs doing the pieces to send their partial-result to the results processor. The third kind, \emph{send\_of\_type}, only specifies the type, and so acts as a global communication channel; this is not used in our application.
   4.208 -
   4.209 - The language also includes a \emph{singleton} construct that designates a piece of code as to be executed only once, which we use to  rearrange and copy data to get better cache behavior. A given copy is shared by several virtual processors on different cores, but the copy only needs to be performed once. 
   4.210 -
   4.211 -Miscellaneous performance constructs are also available, such as one to force which core a virtual processor is assigned to. We use this in our example program to control scheduling.
   4.212 -
   4.213 -A note on terminology: the word ``task''  has acquired multiple  meanings in the literature, making it a vague term. We often use, instead, the term  ``work-unit'', which is defined precisely as the trace-segment performed on a core, between two successive scheduling events, plus the set of datums consumed by that trace segment. The word task often maps well onto this precise definition, and we use both words, but mean the precise work-unit definition when we say task.
   4.214 -
   4.215 -
   4.216 -
   4.217 -\subsection{The Visualizations}
   4.218 - The first visualization is what we refer to as a scheduling consequence graph, or just consequence graph, CG. It depicts each of the scheduling operations performed by the runtime, and the consequent usage of the cores. 
   4.219 -
   4.220 -The second visualization depicts constraints on those scheduling decisions that come from the application, such as dependencies in the code. These limit the choices the runtime is allowed to make. We call this the Unit \& Constraint Collection, or UCC. 
   4.221 -
   4.222 -The UCC shows only application-derived information, as opposed to the consequence graph, which combines the \textit{use} of the UCC-depicted constraints with runtime-imposed dependencies and hardware-imposed constraints.  Hence, the UCC states the degrees of freedom enabled by the application, while the consequence graph states how those were made use of, by a particular runtime on particular hardware.
   4.223 -
   4.224 -Fig X shows a consequence graph (CG), stylized for purposes of explanation. It is composed of a number of columns, one for each core. A column represents time,  with early at the top, increasing as one goes down, measured in clock cycles. It is broken into blocks, each representing the time accounted to one work-unit. Each block is further divided into regions, each a different color which indicates the kind of activity the core was engaged in during that  region's time-span.  
   4.225 -  
   4.226 -The kinds of activities are defined by the computation model that underlies the visualization. The first is the work of a work-unit, represented by a blue-to-red region. It includes time stalled due to cache misses. The color indicates  intensity of cache misses, with pure red representing at or above the maximum misses per instruction, and pure blue the minimum.  The max and min are set in the tool that generates the visualization.
   4.227 -
   4.228 -  The second kind of activity is runtime overhead, represented by a gray region. This is the overhead spent on that particular work-unit. When desired by the viewer, it is further broken into pieces representing activities inside the runtime. These may include acquisition of a lock on shared semantic state, time spent on constraints determining readiness of the work-unit, on deciding which ready one to assign to which hardware, and time spent switching from virtual processor, to the runtime, and back. In this paper, we show all runtime overhead the same way, however in other circumstances a breakdown can be key to understanding interaction between runtime and application. 
   4.229 - 
   4.230 -
   4.231 -The other type of visual feature seen in Fig X is lines. Each represents a construct that influenced scheduling.   The line depicts  two things: a constraint and a decision, both inside the runtime. The constraint was satisfied, which made the   decision possible, choosing which core to do the work on. 
   4.232 -
   4.233 -In general, other kinds of lines may also be drawn, representing other kinds of interactions that affect core usage. For example,  visualization can be turned on for  the internal runtime constraint that only one core at a time may access shared constraint and scheduling state. This appears as additional lines linking the gray runtime regions of blocks. In this paper, visualization is turned off for such minor interactions. 
   4.234 -
   4.235 -Successive work-unit (blue-violet) blocks that go in sequence often have a causal dependency between them, due to the semantics of the base language.
   4.236 - 
   4.237 -Many different orderings could also have been validly chosen. Which scheduler choices are valid is determined by three kinds of constraints: the application code constraints, hardware constraints, and runtime implementation imposed constraints. 
   4.238 -
   4.239 -Returning to Fig \ref{story}, the lines in red, orange, and green represent application-code constraints that each tie two work-units together. The color represents the  kind of constraint imposed by one kind of  parallelism construct. Red is creation of a virtual processor, green is the many-to-one \texttt{send\_of\_type\_to}, and orange is the singleton construct. For better visibility, only constraints that cross cores are shown.
   4.240 -
   4.241 -The columns represent each core, and the vertical axis represents the progress of the program execution, starting from the top. This gives at a glance the total runtime (height), level of parallelism during the run (number of filled columns at a specific height), and processor utilization (blue-covered area vs. white). All figures are at the same scale, so they can be compared directly.
   4.242 -
   4.243 -\subsection{Walk-through}
   4.244 -
   4.245 -After functional debugging, the first run produces the visualizations seen in Figures X and X. The first thing to notice, is that the first picture is slimmer than expected: of the 40 available cores, only 13 are being used. As the application places work on cores explicitly, this must be a bug in the dividing code. A cursory inspection reveals that a closing curly brace in the distribution loop had been misplaced. This may be a very simple bug, but it went unnoticed despite using this application as test program for development of the language runtime, including performance, for several months.
   4.246 +After functional debugging, the first run produces the consequence graph seen in Figure \ref{fig:story:a}. The first thing to notice is that it is slimmer than expected: of the 40 available cores, only 13 are being used. As the application places work on cores explicitly, this must be a bug in the dividing code. A cursory inspection reveals that a closing curly brace in the distribution loop had been misplaced. This may be a very simple bug, but it went unnoticed despite using this application as a test program for development of the language runtime, including performance, for several months.
   4.247  
   4.248  \subsubsection{Second Run}
   4.249 - The second run (Fig \ref{story:b}) already corresponds much more to the expected execution behaviour. However, there remains a noticeable section at the beginning where only 3 cores have work and the other 37 remain idle. 
   4.250 + The second run (Fig \ref{story:b}) already corresponds much more to the expected execution behaviour. However, there remains a noticeable section at the beginning where only 3 cores have work and the other 37 remain idle.
   4.251  
   4.252  Zooming in on those  cores, we see that the task creation VP animates a chain of short tasks (each with a red edge outgoing), and is assigned to core 0. A task is the work in-between scheduling decisions, and creating a task requires switching to the runtime. In order to animate the next work creation task, the creator VP has to be chosen again. However, the creation VP makes the work for all the cores, and starts with itself, core 0, while the runtime animates tasks in the order they become ready. So, after the creator VP makes a work task for itself, that task is ready, and the next chained creation task is put into the queue behind it. That means the work task is chosen next, and the creation task gets left in the queue while work is done, during which task creation is suspended (the merits of work stealing or other scheduling strategies are independent from this illustration of how to use this approach to performance tuning).
   4.253  
   4.254 @@ -517,79 +534,27 @@
   4.255  
   4.256  In Fig X, ``holes'' are noticeable. Inspecting these holes closer, we can see that the stalling tasks are waiting upon the completion of a singleton. However, the operations enclosed in the singleton take only a short time, and start much later than the idle periods. Once again, the in-order animation of queued tasks is at fault: When the first VP reaches the singleton code portion, it sends a request to acquire the singleton. This request succeeds, but as sending a request results in suspension of the requesting VP, a new task is scheduled first. If this happens to be a long task, the singleton is suspended for a long time, but all other VPs with this singleton have to wait, because it has already been reserved. Because several VPs assigned to the same core share the same matrix pieces so as to increase cache locality, this can result in all VPs on a core being stalled, leading to the observed idle times. This is a property of the language runtime, so the application programmer cannot change this, but making the work units smaller helps minimize these effects. 
   4.257  
   4.258 -%\begin{figure}[ht]
   4.259 -% \begin{minipage}[b]{0.5\linewidth}
   4.260 -%  \centering
   4.261 -%  \includegraphics[width=0.27in, height=6in]{../figures/184.pdf}
   4.262 -%  \caption{default}
   4.263 -%  \label{fig:figure1}
   4.264 -% \end{minipage}
   4.265 -%\hspace{0.5cm}
   4.266 -% \begin{minipage}[b]{0.5\linewidth}
   4.267 -%  \centering
   4.268 -%  \includegraphics[width=1in]{../figures/.pdf}
   4.269 -%  \caption{default}
   4.270 -%  \label{fig:figure2}
   4.271 -% \end{minipage}
   4.272 -%\end{figure}
   4.273  
   4.274  
   4.275 -%\begin{figure*}
   4.276 -%\centering
   4.277 -%\mbox
   4.278 -% { \subfigure[description of left graph]
   4.279 -%    {\includegraphics[width=0.2in]{../figures/192.pdf} 
   4.280 -%    }\quad
   4.281 -%   \subfigure[description of right graph]
   4.282 -%    {\includegraphics[width=1in]{../figures/194.pdf} 
   4.283 -%    }
   4.284 -%   \subfigure[description of right graph]
   4.285 -%    {\includegraphics[width=1in]{../figures/196.pdf} 
   4.286 -%    }
   4.287 -%   \subfigure[description of right graph]
   4.288 -%    {\includegraphics[width=1in]{../figures/199.pdf} 
   4.289 -%    }
   4.290 -%   \subfigure[description of right graph]
   4.291 -%    {\includegraphics[width=1in]{../figures/201.pdf} 
   4.292 -%    }
   4.293 -%   \subfigure[description of right graph]
   4.294 -%    {\includegraphics[width=1in]{../figures/204.pdf} 
   4.295 -%    }
   4.296 -%   \subfigure[description of right graph]
   4.297 -%    {\includegraphics[width=1in]{../figures/208.pdf} 
   4.298 -%    }
   4.299 -% }
   4.300 -%\mbox
   4.301 -% {  \subfigure[description of left graph]
   4.302 -%    {\includegraphics[width=1in]{../figures/209.pdf} 
   4.303 -%    }\quad
   4.304 -%   \subfigure[description of right graph]
   4.305 -%    {\includegraphics[width=1in]{../figures/210.pdf} 
   4.306 -%    }
   4.307 -%   \subfigure[description of right graph]
   4.308 -%    {\includegraphics[width=1in]{../figures/212.pdf} 
   4.309 -%    }
   4.310 -% }
   4.311 -%\caption{Text pertaining to all graphs ...} \label{fig12}
   4.312 -%\end{figure*}
   4.313  
   4.314  
   4.315  
   4.316 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   4.317  \section{The Model Behind the Visualization}
   4.318 -\label{theory}
   4.319 +\label{sec:theory}
   4.320  
   4.321  Now that the usage has been seen, we expand on the model behind the visualizations, which is what ties the information together. Understanding of the model leads to quickly seeing the reason for performance-related patterns in the visualizations. Such understanding generates the hypotheses of the source of performance loss. 
   4.322  
   4.323 -The model has two parts, a \emph{Unit \&\ Constraint Collection (UCC)}, and a \emph{Scheduling Consequence Graph} (just consequence graph or CG).  The UCC indicates the freedom of choice the application allows, encoding what the programmer has control over. The consequence graph says which of those were actually taken during the run and the consequences of that set of choices. 
   4.324 +The model has two parts, a \emph{Unit \&\ Constraint Collection (UCC)}, and a \emph{Scheduling Consequence Graph} (just consequence graph or CG).  The UCC indicates the freedom of choice the application allows, encoding what the programmer has control over. The consequence graph says which of those were actually taken during the run and the consequences of that set of choices.
   4.325  
   4.326  We give a more precise description of UCC then consequence graph, in turn.
   4.327  However, space is too limited for a complete definition, which is given in a companion paper submitted to a longer format venue.
   4.328  \subsection{Unit \& Constraint Collection}
   4.329 -The UCC contains all units of work that get scheduled and all application-related constraints on scheduling them. That's a nice solid definition but things aren't quite that simple. The complication is that different classes of application exist, with two degrees of freedom that determine how much of the UCC is actually defined in the application vs the input data, or even the scheduler.
   4.330 +The UCC contains all units of work that get scheduled and all application-related constraints on scheduling them. That's a nice solid definition, but things aren't quite that simple. The complication is that different classes of application exist, with two degrees of freedom that determine how much of the UCC is actually defined in the application vs the input data, or even the scheduler.
   4.331  
   4.332 -Some applications have everything determined in the code, with all units fixed, and all constraints fixed. An example is matrix multiply with fixed size matrices.  But for others, the shape of the UCC is only partially defined by the application code.  Take matrix multiply when the size is an input parameter, and the toolchain divides the work into a fixed number of pieces. Then there are a fixed number of units, but their size depends on the input parameter. So, the UCC is different for each parameter value. An extreme example is an NP complete problem, for which the units are a function of both the particular input data \emph{and} decisions made by the scheduler!
   4.333 +Some applications have everything determined in the code, with all units fixed, and all constraints fixed. An example is matrix multiply with fixed size matrices.  But for others, the shape of the UCC is only partially defined by the application code.  Take matrix multiply when the size is an input parameter: the units in the UCC are different for each parameter value. An extreme example is an NP complete problem, with redividable units, for which the units are a function of both the particular input data \emph{and} decisions made by the scheduler!
   4.334  
   4.335 -    We call a fully specified UCC a \emph{concrete} UCC.  Every run of an application eventually winds up defining a concrete UCC, as seen in this paper. But the amount of UCC made concrete by the application alone falls into a two-dimensional grid. One dimension covers the units, the other the constraints.
   4.336 +    We call a fully specified UCC a \emph{concrete} UCC.  Every run of an application eventually winds up defining a concrete UCC, such as seen back in Fig X. But the amount of UCC made concrete by the application alone falls into a two-dimensional grid. One dimension covers the units, the other the constraints.
   4.337  
   4.338  
   4.339  \begin{figure}[ht]
   4.340 @@ -599,50 +564,43 @@
   4.341    \label{fig:UCC_Concreteness}
   4.342  \end{figure}
   4.343  
   4.344 -Figure \ref{fig:UCC_Concreteness} shows the two axes and the four sets of information on each that are required to make the given UCC fully concrete. The horizontal axis is for  units, vertical for constraints.  0 indicates that the units/constraints are fully fixed by the application code alone; 1 means parameter values must be known in addition to code; 2 means input data values also play a role, and 3 means the units/constraints can only become known during the run, as scheduler/runtime decisions affect the units and/or constraints. 
   4.345 +Figure \ref{fig:UCC_Concreteness} shows the two axes and the four sets of information on each, which act as the inputs that determine the units and constraints. The position where a UCC lands on the grid indicates the information still needed in order to make the UCC fully concrete. The horizontal axis indicates what inputs are still needed to determine the units, and the vertical axis what is still needed to determine the constraints.  0 indicates that the units (constraints) are fully determined by the application code alone; 1 means parameter values also must be known; 2 means input data values also play a role, and 3 means the units (constraints) can only become known  after runtime scheduling decisions have been made. 
   4.346  
   4.347 -The closer the UCC is to the origin, the less information it needs to become concrete. The UCC labeled A in the figure is fully concrete just from the source code alone (representing for example, matrix multiply with fixed size matrices). The UCC labeled B requires the input data and parameters to be specified before its units are concrete, but just parameters to make its constraints fully concrete (as per ray-tracing, with bounce depth specified as a parameter). The UCC labeled C only has variability in its constraints, which require input data (for example, H.264 motion vectors).
   4.348 -
   4.349 -(as for D in the figure, representing an NP complete problem with a runtime that is able to stop a unit in progress and re-divide it, as in [DKU paper]) 
   4.350 -
   4.351 -But even the least concrete UCC, out at the end of the diagonal, becomes concrete during a run of the application (D in the figure). As the computation unfolds, data values interact with scheduling decisions, to fix the units and  constraints on them, until all have been made concrete by the end of the run.
   4.352 -
   4.353 -Notice, though, that such a concrete UCC still has degrees of freedom. The same units could have been chosen, with the same constraints on them, but different decisions made about which hardware the units were assigned to, and which order the assignment was made in. These decisions interact with the hardware, to yield the communication patterns and consequent performance seen during the run. 
   4.354 +The closer the application-derived UCC is to the origin, the less additional information it needs to become concrete. The UCC labeled A in the figure is fully concrete just from the source code alone (representing for example, matrix multiply with fixed size matrices). The UCC labeled B requires the input data and parameters to be specified before its units are concrete, but just parameters to make its constraints fully concrete (as per ray-tracing, with bounce depth specified as a parameter). The UCC labeled C only has variability in its constraints, which require input data (for example, H.264 motion vectors).
   4.355 +But even the least concrete UCC, out at the end of the diagonal (D in the figure), becomes concrete during a run of the application.
   4.356 + 
   4.357 +Notice, though, that  a fully concrete UCC still has degrees of freedom in what hardware and order the units are assigned.  These decisions interact with the hardware, to yield the communication patterns and consequent performance seen during the run. 
   4.358  
   4.359  An added twist is that an application has a life-line, spanning from code through the run, and its representation  may change at the different stages of life. It starts as pristine source, then moves  into specialization where code is translated into different representations than the original, and finally the specialized code is run. The UCC often changes between points in  the life-line.
   4.360  
   4.361 -For example, specialization may perform a transform that fixes some units, concretizing the UCC. Alternatively, the toolchain may inject manipulator code that allows the runtime scheduler to divide units during the run. It would stop units and divide them to make more when it runs low. The injection of manipulator code moves the UCC further out on the diagonal.
   4.362 +For example, specialization may perform a static scheduling, which fixes the units, moving the UCC towards the origin. Alternatively, the toolchain may inject manipulator code for the runtime to use, which lets it divide units during the run when it needs more. The injection of manipulator code makes the UCC less concrete, moving it further from the origin.
   4.363  
   4.364 -Applications that are far out on the diagonal only specify a portion of the structure of the UCC, but the UCC still tells what is inside the application's control vs under the scheduler's control. It thus indicates what can be done statically. The further out on the diagonal a UCC is, the less scheduling can be done statically in the toolchain.
   4.365 +The UCC still tells what is inside the application's control vs under the runtime's control, even for applications that land far out on the diagonal. It thus indicates what can be done statically. The further out on the diagonal a UCC is, the less scheduling can be done statically in the toolchain.
   4.366  
   4.367 -In this paper, we do not suggest how to represent UCCs far out on the diagonal. Those actually indicate a multi-verse of concrete-UCCs. Which of them materializes  depends on the data that shows up and what the scheduler does. We only represent the concrete UCC that materializes during a run and leave the question of representing less concrete ones to future work.
   4.368 +In this paper, we do not suggest how to represent UCCs far out on the diagonal. One of those actually indicates a multi-verse of concrete-UCCs. Which of them materializes  depends on the data that shows up and what the scheduler does. We only represent the concrete UCC that materializes during a run and leave the question of representing less concrete ones to future work.
   4.369    
   4.370  
   4.371 +\subsection{Scheduling Consequence Graph}
   4.372 +Whereas the UCC concentrates on application-derived information, the second part of the model adds-in effects of runtime  details and hardware. It's called the Scheduling Consequence Graph because it links scheduling decisions to their performance consequences. But it also indicates the role, in the decision, of application, runtime and hardware details.
   4.373 +As a result it identifies instances of lost performance, and links them to the cause of the loss, as seen in Section \ref{sec:casestudy}.
   4.374  
   4.375 -\subsection{Scheduling Consequence Graph}
   4.376 -Whereas the UCC concentrates on application-derived information, the second part of the model adds-in constraints caused by runtime implementation details and hardware. It's called the Scheduling Consequence Graph, because it links scheduling decisions to their performance consequences and the hardware details involved with those consequences.
   4.377  
   4.378 -The consequence graph integrates all three aspects of parallel performance: application, runtime, and hardware. It identifies all instances of lost performance, and links them to the cause of the loss, as was seen during the tuning story in Section X.
   4.379 -
   4.380 -This graph captures the three kinds of causal relationships that affect core usage: scheduling decisions invoked by the runtime, dependencies inside the runtime itself, and hardware constraints. Each of these  causes some portion of the core usage that took place, and shows up as a feature on the consequence graph. 
   4.381  
   4.382  To distinguish from the UCC, the consequence graph shows the behavior resulting from scheduling decisions actually \emph{made},   from among those \emph{possible.} The UCC shows just the possibilities. Hence, a consequence graph shows \emph{one} of the possible choice-sets allowed by the UCC.
   4.383  
   4.384 -A consequence graph accounts for each bit of core time. It has boxes that each represent one segment of core time, and arcs between the boxes that each represent a causal dependency of some kind. Each box is counted towards one work-unit, and  all boxes counted to the same unit make a node.
   4.385 +A consequence graph accounts for each bit of core time. It has boxes and arcs. The boxes  each represent one segment of core time, which is counted towards one work-unit, and  the collection of boxes counted to the same unit make a node. An arc links two boxes  and represents a causality of some kind.
   4.386  
   4.387  There are several kinds of boxes, one for each reason that the core is being used (or being forced idle), and several kinds of arcs, one for each type of causality between boxes.
   4.388  
   4.389 -The box types are arranged by cause of core usage: application work, waiting for communication of work data, runtime internals, managing constraints, and choosing assignment of work onto cores. The runtime internals have sub-categories but space is limited so we skip those here.
   4.390 +The box types are arranged by reason for core usage: application work, waiting for communication of work data, managing constraints, choosing assignment of work onto cores, and runtime internals. The runtime internals have sub-categories but space is limited so we skip those here.
   4.391  
   4.392 -The arc types are arranged by source of the causal relationship: base-language control dependencies, parallel semantic constraint which fed into a particular choice in the runtime (IE, the choice ties together two specific work-units so the one completing causes other to start), runtime internal causality such as a global lock or a distributed quorum algorithm whose action creates causal dependencies between boxes that represent execution of internal runtime code, and arcs that represent hardware causal relationships, such as one work-unit finishing on a core causes another work-unit to start there, given the choice by the runtime. The finer details are beyond the scope of this paper.
   4.393 -
   4.394 -
   4.395 +The arc types are arranged by source of the causal relationship: control dependency in the base language, parallel constraint that fed  a choice in the runtime (i.e., the choice ties together two specific work-units so that the one completing causes the other to start), runtime internal causality such as a global lock or a distributed quorum algorithm whose action creates causal dependencies between boxes that represent execution of internal runtime code, and arcs that represent hardware causal relationships, such as one work-unit finishing on a core causes another work-unit to start there, given the choice by the runtime. The finer details are beyond the scope of this paper.
   4.396  
   4.397  We will now look at each source of causal relationship.
   4.398  
   4.399 -\paragraph{Scheduling decision causality} Notice that the performance varies between choice-sets. The main reason for the variation is the communication patterns resulting from the particular choices. For a fixed concrete UCC, each set of scheduling choices it allows has a consequent pattern of core usages. That pattern \emph{is} the performance realized. 
   4.400 +\paragraph{Scheduling decision causality} Notice that the performance varies between choice-sets. The variations reflect differences in communication time, which result from the placement of work, chosen by the scheduler. For a particular concrete UCC, each set of scheduling choices it allows has a consequent pattern of core usages.  A given choice ties the unit that completed as the cause of starting the unit chosen. 
   4.401  
   4.402 -The consequence graph also shows control dependencies from the base language, which may add superfluous constraints that further eliminate some otherwise allowed choices in the UCC. An example would be a \texttt{for} loop that creates work-units -- no parallelism constructs cause a sequentialization of the creations, but the base C language sequentializes it nonetheless. 
   4.403 +The consequence graph also includes control dependencies from the base language, which may add superfluous constraints that  eliminate some otherwise allowed choices in the UCC. An example would be a \texttt{for} loop that creates work-units -- no parallelism constructs cause the creations to be done in sequence, but the base C language sequentializes it nonetheless. 
   4.404  
   4.405  \paragraph{Runtime causal dependencies} are introduced by implementation details. For example, the version of VMS we instrumented to take measurements for this paper relies upon a global lock for shared semantic-constraint information. This lock introduces a causal dependency that only allows the runtime to execute on one core at any horizontal line in the consequence graph.
   4.406  
   4.407 @@ -655,7 +613,6 @@
   4.408  Consequence graph features each tie back to features in the UCC and thence to specific segments of code or constructs.
   4.409  
   4.410  
   4.411 -
   4.412  \subsection{Levels of UCC and Consequence Graph}
   4.413  There is one last twist to the story of UCCs and consequence graphs, which is that there are levels of them that correspond to the levels of scheduler in a hierarchical machine. We use an example involving a server machine with a hierarchy of runtimes to illustrate both, concentrating first on just the UCCs, then adding the consequence graph. 
   4.414  
   4.415 @@ -689,16 +646,17 @@
   4.416  
   4.417  The last question is how to handle communication consequences. This is tricky because decisions in higher-level runtimes set the context for decisions in lower-level ones. This means a higher-level choice is linked to the consequences from lower-level choices. The value of a consequence graph is due to linking the size of boxes in it to the decisions made by the scheduler, as represented by the shape. It's not clear how to divide, among the levels, the time that cores spend waiting for non-overlapped communication. We have no good answer at the moment and leave it for future work.
   4.418  
   4.419 +
   4.420 +
   4.421 +
   4.422 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   4.423  \section{Implementation}
   4.424  \label{sec:Implementation}
   4.425  
   4.426  Now that the usage and theory are in hand, we give the details of implementation. We attempt a bottom-up view, to provide an alternate path to understanding the model and visualizations,  as well as providing a path to derive the benefits directly from the details. 
   4.427  
   4.428 -
   4.429  This section will show the level of effort needed to implement our approach for a new language. In short, this involves inserting collection points into the runtime, then modifying the post-processing that produces the visualization.
   4.430  
   4.431 -
   4.432 -
   4.433  We cast the implementation in terms of the computation model, then identify the
   4.434  points inside the runtime that correspond to points in the model. It is only in these spots that instrumentation gets inserted into runtime code. 
   4.435  
   4.436 @@ -709,7 +667,7 @@
   4.437  
   4.438  \subsection{Meta-units and unit life-line in the computation model}
   4.439  
   4.440 -In preparation for mapping the model onto implementation details, we define a meta-unit and unit life-line. These form the basis to decide  the points in the runtime at which data is collected.
   4.441 +In preparation for mapping the model onto implementation details, we define a meta-unit and unit life-line. These form the basis for deciding points in the runtime  where data is collected.
   4.442  
   4.443  Every unit has a meta-unit that represents it in the runtime. A  unit is defined as the trace of application code that exists between two scheduling decisions. Looking at this in more detail, every runtime has some form of internal bookkeeping state for a unit, used to track constraints on it and make decisions about when and where to execute. This exists even if that state is just a pointer to a function that sits in a queue. We call this bookkeeping state for a unit the meta-unit.
   4.444  
   4.445 @@ -717,24 +675,22 @@
   4.446  
   4.447   Note a few implications: first, many activities internal to the runtime are part of a unit's life-line, and take place when only the meta-unit exists, before or after the work of the actual unit; second, communication that is internal to the runtime is part of the unit life-line, such as state updates; third, creation may be implied, such as in pthreads, or triggered such as in dataflow, or be by explicit command such as in StarSs, and once created, a meta-unit may languish before the unit it represents is free to be animated.
   4.448  
   4.449 -Also, note that this explains why the visualizations remain largely the same across languages. The concepts of a meta-unit, a unit, constraints on a unit, and a unit life-line are all valid in every language.  The visualizations are based on these concepts, and so likewise largely remain the same.  In the UCC, only the constraint patterns that represent  the language's constructs change between languages. In the CG, only which construct a line in the CG represents changes.
   4.450 +Also, note that this explains why the visualizations remain largely the same across languages. The concepts of a meta-unit, a unit, constraints on a unit, and a unit life-line are all valid in every language.  The visualizations are based on these concepts, and so likewise largely remain the same.  In the UCC, only the constraint patterns that represent  the language's constructs change between languages. In the SCG, only which construct a line in the SCG represents changes.
   4.451  
   4.452  \subsection{Mapping model onto implementation details in runtime}
   4.453  
   4.454 -The meta-unit and life-line aspects of the computation model map straight-forwardly to the UCC visualization. The constraints in the UCC are those stated in or implied by the application (with the complexities noted in Section \ref{theory}). 
   4.455 +The meta-unit and life-line aspects of the computation model map straight-forwardly to the UCC visualization. The constraints in the UCC are those stated in or implied by the application (with the complexities noted in Section \ref{sec:theory}). 
   4.456  
   4.457 -However, the CG is not a strict expression of the model, rather it's purpose is practical. It shows usage of the cores, and relates that to the quantities in the model. Hence, the measurements for the CG all are boundaries, where the core's time switches from one category in the model to a different.
   4.458 +However, the SCG is not a strict expression of the model; rather, its purpose is practical. It shows usage of the cores, and relates that to the quantities in the model. Hence, the measurements for the SCG all are boundaries, where the core's time switches from one category in the model to a different one.
   4.459  
   4.460 -This differs from the model in subtle ways. Most notably, the model declares segments of time where communications take place, while the CG doesn't measure the communication time directly, rather it captures idleness of the core caused by the non-overlapped portion of that communication. 
   4.461 +This differs from the model in subtle ways. Most notably, the model declares segments of time where communications take place, while the SCG doesn't measure the communication time directly, rather it captures idleness of the core caused by the non-overlapped portion of that communication. 
   4.462  
   4.463 -The reason for the difference is that the CG is intended to highlight the quantity of most interest to application coders, which is accounting for core usage and assigning each idle period to a cause. The choice of units to cores is what determines the source and destination of communications. Hence, non-overlapped communication idle periods are consequences of the  assignment choices made by the scheduler.  This, by the way, leads to the name: scheduling consequence graph.  
   4.464 +This difference stems from the SCG's focus on core usage, and assigning each idle period to a cause. The runtime's choice of units to cores is what determined the source and destination of communications, which caused the idling. Hence, idle periods due to non-overlapped communication  are consequences of the  assignment choices made by the scheduler.  This, by the way, leads to the name: scheduling consequence graph.  
   4.465  
   4.466   
   4.467 +What must be collected during the run differs between the two types of visualization. For the UCC it is unit boundaries and the constraints related to each unit. For the SCG, the same units must  be collected, but also the time a core spends on each segment of the unit's life-line.  Also, implementation details of the runtime will cause things such as idling the core during lock acquisition to be counted towards a unit's life segment. What core activities go to which life segments changes from runtime to runtime. For example, our implementation includes idle time due to acquiring the lock on shared runtime state as part of the state-update life-line step. 
   4.468  
   4.469 -What must be collected during the run differs between the two types of visualization. For the UCC it is unit boundaries and the constraints related to each unit.
   4.470 -For the CG, the same units must  be collected, but also the time a core spends on each segment of the unit's life-line.  Also, implementation details of the runtime will cause things such as idling the core during lock acquisition to be counted towards a unit's life segment. What core activities go to which life segments changes from runtime to runtime. For example, our implementation includes idle time due to acuiring the lock on shared runtime state as part of the state-update life-line step. 
   4.471 -
   4.472 -The SCG also represents each  cause of a transition from one segment of core usage to another as an arc between boxes. Such a causation is always a causal dependency of some kind, even if it corresponds to a complex construct in the application. These causations are collected and  tied to one of: construct constraint, runtime internal constraint (such as must acquire lock), or hardware constraint (such as only one activity at a time on a core). In this paper, all are collected, but the only causations visualized are constructs  that cross cores, with propendent on one core and its dependent on another.
   4.473 +The SCG represents each  cause of a transition from one segment of core usage to another as an arc between boxes. Such a causation is always a causal dependency of some kind, because the SCG only represents physical events, even if it corresponds to a complex construct in the application. These causations are collected and  tied to one of: construct constraint, runtime internal constraint (such as must acquire lock), or hardware constraint (such as only one activity at a time on a core). In this paper, all are collected, but the only causations visualized are constructs  that cross cores, with propendent on one core and its dependent on another.
   4.474  
   4.475  \subsection{Instrumenting our implementation of SSR on top of VMS}
   4.476  
   4.477 @@ -743,57 +699,55 @@
   4.478  VMS also has the advantage for our approach of being written in accordance with the computation model, which makes instrumenting it especially convenient. Each language construct has its own handler into which to insert measurement code, and transitions in unit life-lines also have convenient locations in VMS to insert instrumentation code.
   4.479  
   4.480  \subsubsection{SSR background}
   4.481 -A distinction important in understanding SSR and other parallel languages is being task-based versus virtual processor (VP) based. Task-based languages include dataflow, CnC, and StarSs.  These tasks don't suspend and resume, but rather execute to completion. Hence, such a task is the same as our definition of unit. They have no state that persists across calls to the runtime. In contrast, a virtual processor does suspend and resume and so has state that persists across runtime calls. Examples include pthreads, OpenMP thread-based constructs, UPC, and so on.
   4.482 +A distinction important to understanding SSR and other parallel languages is being task-based versus virtual processor (VP) based. Task-based languages include dataflow, CnC, and StarSs.  These tasks don't suspend and resume, but rather execute to completion. Hence, such a task is the same as our definition of unit. They have no state that persists across calls to the runtime. In contrast, a virtual processor does suspend and resume and so has state that persists across runtime calls. Examples include pthreads, OpenMP thread-based constructs, UPC, and so on.
   4.483  
   4.484  SSR is based on virtual processors. They execute sequential code that occasionally calls a parallel construct, which suspends the VP and switches to the runtime. This means that each VP contains a sequence of units, with each unit the trace-segment between two SSR library calls.
   4.485   
   4.486  
   4.487 -SSR has both deterministic constraints, which specify the source and destination VP, such as send\_from\_to, and non-deterministic ones, in which the runtime is what chooses which VPs interact, such as send\_to\_of\_type and singleton. Deterministic ones display the same in the UCC and the SCG. However, non-deterministic constraints need all possibilities to be determined for the UCC, requiring extra instrumentation code.
   4.488 +SSR has both deterministic constraints, which specify the source and destination VP, such as send\_from\_to, and non-deterministic ones, in which the runtime is what chooses which VPs interact, such as send\_of\_type\_to\ and singleton. Deterministic ones display the same in the UCC and the SCG. However, non-deterministic ones need all possibilities to be determined for the UCC, requiring extra instrumentation code.
   4.489  
   4.490  \subsubsection{Collecting a unit}
   4.491 -A unit begins when transitioning out of the runtime and into the unit's application code. It ends by leaving the unit's code back into the runtime. These transitions both have a convenient spot in VMS at which to insert the code that records the units. 
   4.492 +Code to record a new unit is inserted into VMS at the transition out of the runtime and into application code. Code to record the unit end is inserted into the VMS primitive that switches back to the runtime. 
   4.493  
   4.494 -\subsubsection{Collecting the constraints} In VMS, each language construct has its own handler. Into each handler, code is inserted to record which unit invoked the construct, and any units freed by it. The SCG needs  to link the unit that made a construct call to the unit freed by that call.
   4.495 +\subsubsection{Collecting the constraints} In VMS, each language construct has its own handler. We insert code into each handler to record which unit invoked the construct and any units freed by it. The SCG links the unit that made a construct call to the units freed by that call.
   4.496  
   4.497  What information needs to be collected for SCG and UCC and how it is done depends on the construct:
   4.498  \begin{itemize}
   4.499 -\item create\_VP: The creation of a new VP implies a simple dependency: the first unit in the new VP may only execute after the  creating unit ends via the create call. We place code into the create\_VP\ handler, which records  the calling VP\ + unit, along with the newly created unit, and the VP it is assigned to.
   4.500 -\item send\_from\_to and receive\_from\_to: These constructs are rendez-vous based, meaning that the unit following the call can only begin after \emph{both} VPs involved have completed the call. We represent this by two crossing dependencies. The constructs are deterministic, so both the UCC and SCG use the same recording. Code is placed into both handlers at the point that checks if both the rendez-vous requests are present. When true, it records both the unit+VPs that connected.
   4.501 -\item Send\_to\_of\_type and receive\_to\_of\_type: These are also rendezvous based, but the pairing of sender and receiver is not deterministic. The SCG   represents the decision made inside the runtime, so inside the handler, the later of the two units connected by the runtime records the pairing.  But for the UCC, we want to capture all sending and receiving permutations available, so we add code that collects the group of senders and the corresponding group of receivers.
   4.502 -\item Singleton: For the singleton construct, there is a single unit with a group of predecessor units and a group of successor units. The first predecessor to complete enables the singleton unit. All successors must wait for completion of the singleton. We insert code into the handler that records the predecessor that enabled the singleton, which is all that the SCG needs. For the UCC, we add code inside the call to start the singleton. It records the unit that the call ended, and the successor-unit it causes to start.
   4.503 -
   4.504 +\item create\_VP:  We place code into the create\_VP\ handler, which records  the calling VP\ + unit, along with the newly created unit, and the VP it is assigned to. Both the SCG and UCC draw arcs between creating unit and created.
   4.505 +\item send\_from\_to and receive\_from\_to: Code is placed into both handlers at the point that checks if both the rendez-vous requests are present. When true, it records both the unit+VPs that connected. The UCC and SCG both represent this by two crossing dependencies.
   4.506 +\item Send\_to\_of\_type and receive\_to\_of\_type:   The same code is inserted to record both the unit+VPs that connected. This is enough for the SCG.  But for the UCC, we want to capture all sending and receiving permutations available, so we add code that collects the group of senders and the corresponding group of receivers.
   4.507 +\item Singleton:   The singleton unit has a group of predecessor units and a group of successor units. The first predecessor to complete enables the singleton unit, while all successors must wait for its completion. We insert code into the handler, which records the predecessor that enabled the singleton. This is all that the SCG needs. For the UCC, we add code inside the singleton call that collects the calling unit, adding it to the predecessor group, and the unit it jumps to, adding that to the successor group.
   4.508 + 
   4.509  \end{itemize}
   4.510  
   4.511  \subsubsection{Recording time, instructions, and cache misses }
   4.512 - Just recording the units and connections between them is not enough. Because the SCG represents core usage, it also needs  the time spent on each activity, including internal runtime activities, to be recorded. Each interval of time is assigned to a  segment of a particular unit's life-line.
   4.513 + Just recording the units and connections between them is not enough. Because the SCG represents core usage, it also needs  the cycles spent on each activity, including internal runtime activities. The size of each interval of core usage is recorded and assigned to a  segment of a particular unit's life-line.
   4.514  
   4.515 -The UCC makes use  of the number of instructions in a unit, as an estimate of size of work in the unit, as illustrated by Fig [UCC same-sz vs UCC instr-sz]. Without knowing the relative size of the units, it is hard to estimate the amount of parallelism \emph{usefully} available in the application.
   4.516 +The UCC also makes use of the number of instructions in a unit, as an estimate of the size of the work in the unit, as illustrated by Fig [UCC same-sz vs UCC instr-sz]. Without knowing the relative size of the units, it is hard to estimate the amount of parallelism \emph{usefully} available in the application.
   4.517  
   4.518  To measure the instructions, cycles, and communication (cache misses), we use hardware performance counters. Readings are inserted into the runtime code to capture core time spent on each segment of the life-line of a unit: 
   4.519  \begin{enumerate}
   4.520 -\item Create meta-unit: In VMS, this is measuring the time the create\_VP construct handler consumes on the core. 
   4.521 -\item Update constraints: In VMS, this is the time spent inside the language-supplied construct-handler function.
   4.522 +\item Create meta-unit: This is the time spent inside the create\_VP construct's handler function. 
   4.523 +\item Update constraints: This is the time spent inside the handler functions that implement the constructs.
   4.524  \item Decision to animate: This is the time spent inside the language-supplied assigner function.
   4.525  \item Move meta-unit to core: This is via shared variables, recorded as part of 3.
   4.526  \item Move work data to core: This is via cache misses, recorded as part of 6.
   4.527  \item Do the work of the unit: This is measured by instrumenting the VMS switch-to-unit primitive and the corresponding switch-to-runtime primitive. 
   4.528 -\item Communicate state update: in VMS, this is the time between leaving the application code and starting the construct handler (which includes lock acquisition).
   4.529 -\item Resulting constraint updates: in VMS, this is the time spent inside the construct handler, and is the same as 2 
   4.530 +\item Communicate state update: This is the time between leaving the application code and starting the construct handler (which includes lock acquisition).
   4.531 +\item Resulting constraint updates: This is the time spent inside the construct handler, and is the same as 2.
   4.532  \end{enumerate}
   4.533  
   4.534 -In summary, to cover each of the segments of a unit's life-line, code to read  the performance counters is inserted at:
   4.535 +To cover each of those segments of a unit's life-line, code to read  the performance counters is inserted at:
   4.536  
   4.537  \begin{itemize}
   4.538 -\item Construct handler: To measure 2 and 8, reading is done before and after VMS calls the language-supplied construct handler function
   4.539 -\item
   4.540 +\item Construct handler: To measure 2 and 8, reading is done before and after VMS calls the language-supplied construct handler function.\item
   4.541  Assigner: To measure 3 and 4, reading is done before and after VMS calls  the language-supplied assigner function.
   4.542  \item Work: To measure 5 and 6, reading is done at the point VMS switches to the unit, and the point it switches back into the runtime.
   4.543  
   4.544  \item 
   4.545 -Dual-use: To measure 1, the construct handler's reading points are used. To measure 7, the reading done upon switching into runtime is coupled to the reading done just before starting the construct handler function.
   4.546 +Dual-use:  1 is measured by using the reads on the create\_VP construct handler. To measure 7, the reading done upon switching into runtime is coupled to the reading done just before starting the construct handler function.
   4.547  \end{itemize}
   4.548  
   4.549 -  
   4.550  
   4.551  For clarity, all but work are grouped as overhead in the visualization, but they could be displayed separately if needed.
   4.552  
   4.553 @@ -804,23 +758,32 @@
   4.554  Both the UCC and the SCG are represented as directed graphs, with units as nodes.
   4.555  
   4.556  \subsubsection{UCC}
   4.557 -For the UCC, units can be either unweighted or weighted. Weighted units appear as rectangles with height proportional to the weight, unweighted units are circles. Our implementation can use the number of instructions in the work section from a run to weight the units. This removes some of the influence of scheduling and data, such as cache misses, but can be insufficient if the application is strongly data- or scheduling-dependent.
   4.558 +For the UCC, units can be either unweighted or weighted. Weighted units appear as rectangles with height proportional to the weight, unweighted units as circles. We weight the units with the number of instructions in the work. This removes some of the influence of scheduling and data, such as cache misses.
   4.559  
   4.560 -Simple, deterministic dependencies are represented as arcs. Complicated constraints are for now displayed as an additional node bearing information on the constraint with incoming arcs from all units whose execution status affects the constraint and outgoing arcs to the constrained units.
   4.561 +A critical path algorithm calculates the vertical position of a unit from its position in the critical path. The nodes are spread horizontally such that none overlap.
   4.562  
   4.563 -A critical path algorithm is then used to place the nodes vertically, from top to bottom. For non-deterministic constraints, it is possible to enable or disable their participation in the path. Enabling them will lead to an over-estimation of the critical path, disabling them to an under-estimation (better solutions welcome). The nodes are spread horizontally such that none overlap.
   4.564 +Simple constraints (dependencies) are represented as arcs. Complicated constraints are for now displayed as an additional node bearing information on the constraint, with incoming arcs from all units whose execution status affects the constraint and outgoing arcs to the constrained units.
   4.565 +
   4.566  
   4.567  \subsubsection{SCG}
   4.568  
   4.569 -For the SCG, all nodes are weighted with the number of cycles spent on the unit in total (work + overhead). Nodes are then further displayed separated into overhead and work. Because it displays a concrete run, the actual choices made for all non-deterministic constraints are available, so all dependencies are deterministic. The same critical path algorithm as for the UCC is used to place nodes vertically, but this time horizontal placement is determined by the core on which the unit was executed  (hardware dependencies ensure no overlap).
   4.570 +For the SCG, all nodes are weighted with the number of cycles spent on the unit in total (work + overhead). For display, the nodes  are  split into overhead and work. The same critical path algorithm as for the UCC is used to place nodes vertically, but this time horizontal placement is determined by the core on which the unit was executed  (hardware dependencies ensure no overlap).
   4.571  
   4.572 -A selection of constraints can then be overlaid, with color codes by type. In SSR we display, as the default, creation, direct and typed message sending (but not the crossing dependency from the receiver back to the sender), and singleton outgoing dependencies; but each type can be individually hidden or shown.
   4.573 +Constraints can then be overlaid, color coded by type. By default, in SSR, we display creation, direct and typed message sending (but not the crossing dependency from the receiver back to the sender), and singleton outgoing dependencies; but each type can be individually hidden or shown.
   4.574  
   4.575  All this information is taken purely from the runtime, leading to a rich, configurable visualization without needing to add anything to the application.
   4.576  
   4.577 +
   4.578 +
   4.579 +
   4.580 +
   4.581 +
   4.582 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   4.583  \section{Conclusion}
   4.584 -\label{conclusion}
   4.585 -We have shown how to apply a computation model to instrument a language runtime for collecting measurements that connect: each measurement to others, to application structure, to scheduling decisions, and to hardware. A simple visualization of the data has features that indicate lost performance, and features that visually link the lost performance to the cause, no matter if the cause is application structure, language runtime implementation, or hardware feature.  It is this linkage, due to the computation model, that sets this approach apart from others. 
   4.586 +\label{sec:conclusion}
   4.587 +We have shown how to apply a computation model to instrument a language runtime for collecting measurements that connect each measurement to others, to application structure, to scheduling decisions, and to hardware. A simple visualization of the data has features that indicate lost performance, and features that visually link the lost performance to the cause, no matter if the cause is application structure, language runtime implementation, or hardware feature.  It is this linkage, due to the computation model, that sets this approach apart from others.
   4.588 +
   4.589 +\end{document} 
   4.590  
   4.591  ===============  
   4.592