/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 2012-2015 LunarG, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chia-I Wu <olv@lunarg.com>
 */

#include "ilo_debug.h"
#include "ilo_state_compute.h"

struct compute_urb_configuration {
   int idrt_entry_count;
   int curbe_entry_count;

   int urb_entry_count;
   /* in 256-bit register increments */
   int urb_entry_size;
};

static int
get_gen6_rob_entry_count(const struct ilo_dev *dev)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 60:
    *
    *     "ROB has 64KB of storage; 2048 entries."
    *
    * From the valid ranges of "CURBE Allocation Size", we can also conclude
    * that interface entries and CURBE data must be in ROB.  And that ROB
    * should be 16KB, or 512 entries, on Gen7 GT1.
    */
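   /* A hedged reading of the numbers above: each ROB/URB entry appears to be
    * one 256-bit (32-byte) register, so 64KB of storage works out to
    * 64 * 1024 / 32 = 2048 entries, and 16KB to 512 entries.
    */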
   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
      return 2048;
   else if (ilo_dev_gen(dev) >= ILO_GEN(7))
      return (dev->gt == 2) ? 2048 : 512;
   else
      return (dev->gt == 2) ? 2048 : 1024;
}

static int
get_gen6_idrt_entry_count(const struct ilo_dev *dev)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
    *
    *     "The first 32 URB entries are reserved for the interface
    *      descriptor..."
    *
    * From the Haswell PRM, volume 7, page 836:
    *
    *     "The first 64 URB entries are reserved for the interface
    *      description..."
    */
   return (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 64 : 32;
}

static int
get_gen6_curbe_entry_count(const struct ilo_dev *dev, uint32_t curbe_size)
{
   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
    *
    *     "(CURBE Allocation Size) Specifies the total length allocated for
    *      CURBE, in 256-bit register increments."
    */
   const int entry_count = (curbe_size + 31) / 32;
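   /* For example (assuming curbe_size is in bytes), a 100-byte constant
    * block rounds up to (100 + 31) / 32 = 4 registers.
    */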

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(get_gen6_idrt_entry_count(dev) + entry_count <=
         get_gen6_rob_entry_count(dev));

   return entry_count;
}

static bool
compute_get_gen6_urb_configuration(const struct ilo_dev *dev,
                                   const struct ilo_state_compute_info *info,
                                   struct compute_urb_configuration *urb)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   urb->idrt_entry_count = get_gen6_idrt_entry_count(dev);
   urb->curbe_entry_count =
      get_gen6_curbe_entry_count(dev, info->curbe_alloc_size);

   /*
    * From the Broadwell PRM, volume 2b, page 451:
    *
    *     "Please note that 0 is not allowed for this field (Number of URB
    *      Entries)."
    */
   urb->urb_entry_count = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 1 : 0;

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 52:
    *
    *     "(URB Entry Allocation Size) Specifies the length of each URB entry
    *      used by the unit, in 256-bit register increments - 1."
    */
   urb->urb_entry_size = 1;
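   /* Note that urb_entry_size holds the actual size in registers (1 here),
    * not the "minus one" encoding used by the hardware field; the encoding
    * is applied when MEDIA_VFE_STATE DW4 is packed below.
    */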

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 22:
    *
    *     "MEDIA_VFE_STATE specifies the amount of CURBE space, the URB handle
    *      size and the number of URB handles. The driver must ensure that
    *      ((URB_handle_size * URB_num_handle) - CURBE - 32) <=
    *      URB_allocation_in_L3."
    */
   assert(urb->idrt_entry_count + urb->curbe_entry_count +
         urb->urb_entry_count * urb->urb_entry_size <=
         info->cv_urb_alloc_size / 32);
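   /* The check presumably treats cv_urb_alloc_size as a byte count, with the
    * division by 32 converting it to 256-bit register units to match the
    * entry counts above.
    */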

   return true;
}

static int
compute_interface_get_gen6_read_end(const struct ilo_dev *dev,
                                    const struct ilo_state_compute_interface_info *interface)
{
   const int per_thread_read = (interface->curbe_read_length + 31) / 32;
   const int cross_thread_read =
      (interface->cross_thread_curbe_read_length + 31) / 32;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(interface->curbe_read_offset % 32 == 0);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 60:
    *
    *     "(Constant URB Entry Read Length) [0,63]"
    */
   assert(per_thread_read <= 63);

   /*
    * From the Haswell PRM, volume 2d, page 199:
    *
    *     "(Cross-Thread Constant Data Read Length) [0,127]"
    */
   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
      assert(cross_thread_read <= 127);
   else
      assert(!cross_thread_read);

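   /* The CURBE layout assumed here: cross-thread constants start at
    * curbe_read_offset (in 256-bit units), followed by one per-thread block
    * for each of the thread_group_size threads, which gives the read end
    * computed below.
    */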
   if (per_thread_read || cross_thread_read) {
      return interface->curbe_read_offset / 32 + cross_thread_read +
         per_thread_read * interface->thread_group_size;
   } else {
      return 0;
   }
}

static bool
compute_validate_gen6(const struct ilo_dev *dev,
                      const struct ilo_state_compute_info *info,
                      const struct compute_urb_configuration *urb)
{
   int min_curbe_entry_count;
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(info->interface_count <= urb->idrt_entry_count);

   min_curbe_entry_count = 0;
   for (i = 0; i < info->interface_count; i++) {
      const int read_end =
         compute_interface_get_gen6_read_end(dev, &info->interfaces[i]);

      if (min_curbe_entry_count < read_end)
         min_curbe_entry_count = read_end;
   }

   assert(min_curbe_entry_count <= urb->curbe_entry_count);

   /*
    * From the Broadwell PRM, volume 2b, page 452:
    *
    *     "CURBE Allocation Size should be 0 for GPGPU workloads that uses
    *      indirect instead of CURBE."
    */
   if (!min_curbe_entry_count)
      assert(!urb->curbe_entry_count);

   return true;
}

static uint32_t
compute_get_gen6_per_thread_scratch_size(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_info *info,
                                         uint8_t *per_thread_space)
{
   ILO_DEV_ASSERT(dev, 6, 7);

   /*
    * From the Sandy Bridge PRM, volume 2 part 2, page 30:
    *
    *     "(Per Thread Scratch Space)
    *      Range = [0,11] indicating [1k bytes, 12k bytes] [DevSNB]"
    */
   assert(info->per_thread_scratch_size <= 12 * 1024);

   if (!info->per_thread_scratch_size) {
      *per_thread_space = 0;
      return 0;
   }

   *per_thread_space = (info->per_thread_scratch_size > 1024) ?
      (info->per_thread_scratch_size - 1) / 1024 : 0;
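   /* E.g. a request for 5000 bytes yields *per_thread_space = 4999 / 1024 = 4
    * and a rounded-up allocation of 1024 * (1 + 4) = 5120 bytes per thread.
    */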

   return 1024 * (1 + *per_thread_space);
}

static uint32_t
compute_get_gen75_per_thread_scratch_size(const struct ilo_dev *dev,
                                          const struct ilo_state_compute_info *info,
                                          uint8_t *per_thread_space)
{
   ILO_DEV_ASSERT(dev, 7.5, 8);

   /*
    * From the Haswell PRM, volume 2b, page 407:
    *
    *     "(Per Thread Scratch Space)
    *      [0,10]  Indicating [2k bytes, 2 Mbytes]"
    *
    *     "Note: The scratch space should be declared as 2x the desired
    *      scratch space. The stack will start at the half-way point instead
    *      of the end. The upper half of scratch space will not be accessed
    *      and so does not have to be allocated in memory."
    *
    * From the Broadwell PRM, volume 2a, page 450:
    *
    *     "(Per Thread Scratch Space)
    *      [0,11]  indicating [1k bytes, 2 Mbytes]"
    */
   assert(info->per_thread_scratch_size <=
         ((ilo_dev_gen(dev) >= ILO_GEN(8)) ? 2 : 1) * 1024 * 1024);

   if (!info->per_thread_scratch_size) {
      *per_thread_space = 0;
      return 0;
   }

   /* next power of two, starting from 1KB */
   *per_thread_space = (info->per_thread_scratch_size > 1024) ?
      (util_last_bit(info->per_thread_scratch_size - 1) - 10) : 0;
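   /* E.g. 5000 bytes: util_last_bit(4999) = 13, so *per_thread_space = 3 and
    * the allocation below is 1 << 13 = 8192 bytes per thread.
    */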

   return 1 << (10 + *per_thread_space);
}

static bool
compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
                                 const struct ilo_dev *dev,
                                 const struct ilo_state_compute_info *info)
{
   struct compute_urb_configuration urb;
   uint32_t per_thread_size;
   uint8_t per_thread_space;

   uint32_t dw1, dw2, dw4;

   ILO_DEV_ASSERT(dev, 6, 8);

   if (!compute_get_gen6_urb_configuration(dev, info, &urb) ||
       !compute_validate_gen6(dev, info, &urb))
      return false;

   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
      per_thread_size = compute_get_gen75_per_thread_scratch_size(dev,
            info, &per_thread_space);
   } else {
      per_thread_size = compute_get_gen6_per_thread_scratch_size(dev,
            info, &per_thread_space);
   }

   dw1 = per_thread_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT;

   dw2 = (dev->thread_count - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT |
         urb.urb_entry_count << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT |
         GEN6_VFE_DW2_RESET_GATEWAY_TIMER |
         GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL;

   if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5))
      dw2 |= GEN7_VFE_DW2_GPGPU_MODE;

   assert(urb.urb_entry_size);

   dw4 = (urb.urb_entry_size - 1) << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT |
         urb.curbe_entry_count << GEN6_VFE_DW4_CURBE_SIZE__SHIFT;

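   /* Only DW1, DW2 and DW4 of MEDIA_VFE_STATE are precomputed here; the
    * remaining dwords (e.g. the scratch space base address) are presumably
    * filled in when the command is actually emitted.
    */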
   STATIC_ASSERT(ARRAY_SIZE(compute->vfe) >= 3);
   compute->vfe[0] = dw1;
   compute->vfe[1] = dw2;
   compute->vfe[2] = dw4;

   compute->scratch_size = per_thread_size * dev->thread_count;

   return true;
}

static uint8_t
compute_interface_get_gen6_sampler_count(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 6, 8);
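   /* The hardware field counts samplers in groups of 4: e.g. 7 samplers
    * encode as (7 + 3) / 4 = 2.  Counts above 12 are presumably clamped to
    * the maximum encoding of 4 (13-16 samplers).
    */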
   return (interface->sampler_count <= 12) ?
      (interface->sampler_count + 3) / 4 : 4;
}

static uint8_t
compute_interface_get_gen6_surface_count(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 6, 8);
   return (interface->surface_count <= 31) ? interface->surface_count : 31;
}

static uint8_t
compute_interface_get_gen7_slm_size(const struct ilo_dev *dev,
                                    const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 7, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 61:
    *
    *     "The amount is specified in 4k blocks, but only powers of 2 are
    *      allowed: 0, 4k, 8k, 16k, 32k and 64k per half-slice."
    */
   assert(interface->slm_size <= 64 * 1024);

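   /* E.g. a 5000-byte request becomes (5000 + 4095) / 4096 = 2 blocks, which
    * is already a power of two, so 8KB of SLM is reported.
    */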
   return util_next_power_of_two((interface->slm_size + 4095) / 4096);
}

static bool
compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_state_compute *compute,
                                           const struct ilo_dev *dev,
                                           const struct ilo_state_compute_info *info)
{
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 8);

   for (i = 0; i < info->interface_count; i++) {
      const struct ilo_state_compute_interface_info *interface =
         &info->interfaces[i];
      uint16_t read_offset, per_thread_read_len, cross_thread_read_len;
      uint8_t sampler_count, surface_count;
      uint32_t dw0, dw2, dw3, dw4, dw5, dw6;

      assert(interface->kernel_offset % 64 == 0);
      assert(interface->thread_group_size);

      read_offset = interface->curbe_read_offset / 32;
      per_thread_read_len = (interface->curbe_read_length + 31) / 32;
      cross_thread_read_len =
         (interface->cross_thread_curbe_read_length + 31) / 32;

      sampler_count =
         compute_interface_get_gen6_sampler_count(dev, interface);
      surface_count =
         compute_interface_get_gen6_surface_count(dev, interface);

      dw0 = interface->kernel_offset;
      dw2 = sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT;
      dw3 = surface_count << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT;
      dw4 = per_thread_read_len << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT |
            read_offset << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT;

      dw5 = 0;
      dw6 = 0;
      if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
         const uint8_t slm_size =
            compute_interface_get_gen7_slm_size(dev, interface);

         dw5 |= GEN7_IDRT_DW5_ROUNDING_MODE_RTNE;

         if (slm_size) {
            dw5 |= GEN7_IDRT_DW5_BARRIER_ENABLE |
                   slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT;
         }

         /*
          * From the Haswell PRM, volume 2d, page 199:
          *
          *     "(Number of Threads in GPGPU Thread Group) Specifies the
          *      number of threads that are in this thread group.  Used to
          *      program the barrier for the number of messages to expect. The
          *      minimum value is 0 (which will disable the barrier), while
          *      the maximum value is the number of threads in a subslice for
          *      local barriers."
          *
          * From the Broadwell PRM, volume 2d, page 183:
          *
          *     "(Number of Threads in GPGPU Thread Group) Specifies the
          *      number of threads that are in this thread group.  The minimum
          *      value is 1, while the maximum value is the number of threads
          *      in a subslice for local barriers. See vol1b Configurations
          *      for the number of threads per subslice for different
          *      products.  The maximum value for global barriers is limited
          *      by the number of threads in the system, or by 511, whichever
          *      is lower. This field should not be set to 0 even if the
          *      barrier is disabled, since an accurate value is needed for
          *      proper pre-emption."
          */
         if (slm_size || ilo_dev_gen(dev) >= ILO_GEN(8)) {
            dw5 |= interface->thread_group_size <<
               GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT;
         }

         if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
            dw6 |= cross_thread_read_len <<
               GEN75_IDRT_DW6_CROSS_THREAD_CURBE_READ_LEN__SHIFT;
         }
      }

      STATIC_ASSERT(ARRAY_SIZE(compute->idrt[i]) >= 6);
      compute->idrt[i][0] = dw0;
      compute->idrt[i][1] = dw2;
      compute->idrt[i][2] = dw3;
      compute->idrt[i][3] = dw4;
      compute->idrt[i][4] = dw5;
      compute->idrt[i][5] = dw6;
   }

   return true;
}

bool
ilo_state_compute_init(struct ilo_state_compute *compute,
                       const struct ilo_dev *dev,
                       const struct ilo_state_compute_info *info)
{
   bool ret = true;

   assert(ilo_is_zeroed(compute, sizeof(*compute)));
   assert(ilo_is_zeroed(info->data, info->data_size));

   assert(ilo_state_compute_data_size(dev, info->interface_count) <=
         info->data_size);
   compute->idrt = (uint32_t (*)[6]) info->data;
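   /* info->data is assumed to provide zeroed storage for interface_count
    * six-dword interface descriptors, which is what the data_size check
    * above verifies.
    */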

   ret &= compute_set_gen6_MEDIA_VFE_STATE(compute, dev, info);
   ret &= compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(compute, dev, info);

   assert(ret);

   return ret;
}