Skip to content

Commit e20c6ca

Browse files
committed
Changes in docs to account for equivalent generated code from the same LoopKernel
1 parent e0714d1 commit e20c6ca

1 file changed

Lines changed: 13 additions & 15 deletions

File tree

doc/tutorial.rst

Lines changed: 13 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -610,7 +610,7 @@ commonly called 'loop tiling':
610610
... assumptions="n mod 16 = 0 and n >= 1")
611611
>>> knl = lp.split_iname(knl, "i", 16)
612612
>>> knl = lp.split_iname(knl, "j", 16)
613-
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner")
613+
>>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner")
614614
>>> knl = lp.set_options(knl, "write_code")
615615
>>> evt, (out,) = knl(queue, a=a_mat_dev)
616616
#define lid(N) ((int) get_local_id(N))
@@ -1029,8 +1029,8 @@ transformation exists in :func:`loopy.add_prefetch`:
10291029
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
10301030
#define lid(N) ((int) get_local_id(N))
10311031
...
1032-
acc_k = 0.0f;
10331032
a_fetch = a[16 * gid(0) + lid(0)];
1033+
acc_k = 0.0f;
10341034
for (int k = 0; k <= 15; ++k)
10351035
acc_k = acc_k + a_fetch;
10361036
out[16 * gid(0) + lid(0)] = acc_k;
@@ -1053,10 +1053,10 @@ earlier:
10531053
>>> evt, (out,) = knl_pf(queue, a=x_vec_dev)
10541054
#define lid(N) ((int) get_local_id(N))
10551055
...
1056-
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
1057-
acc_k = 0.0f;
10581056
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
10591057
a_fetch[lid(0)] = a[16 * gid(0) + lid(0)];
1058+
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
1059+
acc_k = 0.0f;
10601060
barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */;
10611061
if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
10621062
{
@@ -1908,18 +1908,16 @@ Now to make things more interesting, we'll create a kernel with barriers:
19081908
{
19091909
__local int c[50 * 10 * 99];
19101910
<BLANKLINE>
1911-
{
1912-
int const k_outer = 0;
1913-
<BLANKLINE>
1911+
for (int i = 0; i <= 49; ++i)
19141912
for (int j = 0; j <= 9; ++j)
1915-
for (int i = 0; i <= 49; ++i)
1916-
{
1917-
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
1918-
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
1919-
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
1920-
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
1921-
}
1922-
}
1913+
{
1914+
int const k_outer = 0;
1915+
<BLANKLINE>
1916+
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */;
1917+
c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1];
1918+
barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */;
1919+
e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1];
1920+
}
19231921
}
19241922

19251923
In this kernel, when a work-item performs the second instruction it uses data

0 commit comments

Comments
 (0)