@@ -610,7 +610,7 @@ commonly called 'loop tiling':
610610 ... assumptions= " n mod 16 = 0 and n >= 1" )
611611 >>> knl = lp.split_iname(knl, " i" , 16 )
612612 >>> knl = lp.split_iname(knl, " j" , 16 )
613- >>> knl = lp.prioritize_loops(knl, " i_outer,j_outer,i_inner" )
613+ >>> knl = lp.prioritize_loops(knl, " i_outer,j_outer,i_inner,j_inner " )
614614 >>> knl = lp.set_options(knl, " write_code" )
615615 >>> evt, (out,) = knl(queue, a = a_mat_dev)
616616 #define lid(N) ((int) get_local_id(N))
@@ -1029,8 +1029,8 @@ transformation exists in :func:`loopy.add_prefetch`:
10291029 >>> evt, (out,) = knl_pf(queue, a = x_vec_dev)
10301030 #define lid(N) ((int) get_local_id(N))
10311031 ...
1032- acc_k = 0.0f;
10331032 a_fetch = a[16 * gid(0) + lid(0)];
1033+ acc_k = 0.0f;
10341034 for (int k = 0; k <= 15; ++k)
10351035 acc_k = acc_k + a_fetch;
10361036 out[16 * gid(0) + lid(0)] = acc_k;
@@ -1053,10 +1053,10 @@ earlier:
10531053 >>> evt, (out,) = knl_pf(queue, a = x_vec_dev)
10541054 #define lid(N) ((int) get_local_id(N))
10551055 ...
1056- if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
1057- acc_k = 0.0f;
10581056 if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
10591057 a_fetch[lid(0)] = a[16 * gid(0) + lid(0)];
1058+ if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
1059+ acc_k = 0.0f;
10601060 barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */;
10611061 if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
10621062 {
@@ -1908,18 +1908,16 @@ Now to make things more interesting, we'll create a kernel with barriers:
19081908 {
19091909 __local int c[50 * 10 * 99];
19101910 <BLANKLINE>
1911- {
1912- int const k_outer = 0;
1913- <BLANKLINE>
1911+ for (int i = 0; i <= 49; ++i)
19141912 for (int j = 0; j <= 9; ++j)
1915- for (int i = 0; i <= 49; ++i)
1916- {
1917- barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
1918- c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1] ;
1919- barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */ ;
1920- e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1] ;
1921- }
1922- }
1913+ {
1914+ int const k_outer = 0;
1915+ <BLANKLINE>
1916+ barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */ ;
1917+ c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1] ;
1918+ barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */ ;
1919+ e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
1920+ }
19231921 }
19241922
19251923 In this kernel, when a work-item performs the second instruction it uses data
0 commit comments