1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29 
30 #include "anv_private.h"
31 
32 #include "util/os_time.h"
33 
34 #include "genxml/gen_macros.h"
35 #include "genxml/genX_pack.h"
36 
37 /* We reserve:
38  *    - GPR 14 for perf queries
39  *    - GPR 15 for conditional rendering
40  */
41 #define MI_BUILDER_NUM_ALLOC_GPRS 14
42 #define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
43 #define __gen_get_batch_dwords anv_batch_emit_dwords
44 #define __gen_address_offset anv_address_add
45 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
46 #include "common/mi_builder.h"
47 #include "perf/intel_perf.h"
48 #include "perf/intel_perf_mdapi.h"
49 #include "perf/intel_perf_regs.h"
50 
51 #include "vk_util.h"
52 
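/* Byte address of a query slot inside the pool's backing BO; slots are
 * pool->stride bytes apart.
 */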
53 static struct anv_address
54 anv_query_address(struct anv_query_pool *pool, uint32_t query)
55 {
56    return (struct anv_address) {
57       .bo = pool->bo,
58       .offset = query * pool->stride,
59    };
60 }
61 
62 VkResult genX(CreateQueryPool)(
63     VkDevice                                    _device,
64     const VkQueryPoolCreateInfo*                pCreateInfo,
65     const VkAllocationCallbacks*                pAllocator,
66     VkQueryPool*                                pQueryPool)
67 {
68    ANV_FROM_HANDLE(anv_device, device, _device);
69    const struct anv_physical_device *pdevice = device->physical;
70 #if GFX_VER >= 8
71    const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
72    struct intel_perf_counter_pass *counter_pass;
73    struct intel_perf_query_info **pass_query;
74    uint32_t n_passes = 0;
75 #endif
76    uint32_t data_offset = 0;
77    VK_MULTIALLOC(ma);
78    VkResult result;
79 
80    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
81 
82    /* Query pool slots are made up of some number of 64-bit values packed
83     * tightly together. For most query types, the first 64-bit value is
84     * the "available" bit, which is 0 when the query is unavailable and 1
85     * when it is available. The 64-bit values that follow are determined by
86     * the type of query.
87     *
88     * For performance queries, OA reports have to be aligned to 64 bytes, so
89     * we put those first and place the "available" bit at the end, together
90     * with some other counters.
91     */
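   /* As an illustration, for VK_QUERY_TYPE_OCCLUSION below a slot ends up
    * being three uint64_t values:
    *
    *    { availability, begin_depth_count, end_depth_count }
    *
    * and vkGetQueryPoolResults() reports end_depth_count - begin_depth_count.
    */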
92    uint32_t uint64s_per_slot = 0;
93 
94    VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1);
95 
96    VkQueryPipelineStatisticFlags pipeline_statistics = 0;
97    switch (pCreateInfo->queryType) {
98    case VK_QUERY_TYPE_OCCLUSION:
99       /* Occlusion queries have two values: begin and end. */
100       uint64s_per_slot = 1 + 2;
101       break;
102    case VK_QUERY_TYPE_TIMESTAMP:
103       /* Timestamps just have the one timestamp value */
104       uint64s_per_slot = 1 + 1;
105       break;
106    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
107       pipeline_statistics = pCreateInfo->pipelineStatistics;
108       /* We're going to trust this field implicitly so we need to ensure that
109        * no unhandled extension bits leak in.
110        */
111       pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
112 
113       /* Statistics queries have a begin and end value for every statistic */
114       uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
115       break;
116    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
117       /* Transform feedback queries are 4 values, begin/end for
118        * primitives written and primitives needed.
119        */
120       uint64s_per_slot = 1 + 4;
121       break;
122    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
123       const struct intel_perf_query_field_layout *layout =
124          &pdevice->perf->query_layout;
125 
126       uint64s_per_slot = 2; /* availability + marker */
127       /* Align to the requirement of the layout */
128       uint64s_per_slot = align_u32(uint64s_per_slot,
129                                    DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
130       data_offset = uint64s_per_slot * sizeof(uint64_t);
131       /* Add the query data for begin & end commands */
132       uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
133       break;
134    }
135 #if GFX_VER >= 8
136    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
137       const struct intel_perf_query_field_layout *layout =
138          &pdevice->perf->query_layout;
139 
140       perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
141                                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
142       n_passes = intel_perf_get_n_passes(pdevice->perf,
143                                          perf_query_info->pCounterIndices,
144                                          perf_query_info->counterIndexCount,
145                                          NULL);
146       vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass,
147                              perf_query_info->counterIndexCount);
148       vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
149                              n_passes);
150       uint64s_per_slot = 4 /* availability + small batch */;
151       /* Align to the requirement of the layout */
152       uint64s_per_slot = align_u32(uint64s_per_slot,
153                                    DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
154       data_offset = uint64s_per_slot * sizeof(uint64_t);
155       /* Add the query data for begin & end commands */
156       uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
157       /* Multiply by the number of passes */
158       uint64s_per_slot *= n_passes;
159       break;
160    }
161 #endif
162    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
163       /* Query has two values: begin and end. */
164       uint64s_per_slot = 1 + 2;
165       break;
166    default:
167       assert(!"Invalid query type");
168    }
169 
170    if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
171                              VK_OBJECT_TYPE_QUERY_POOL))
172       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
173 
174    pool->type = pCreateInfo->queryType;
175    pool->pipeline_statistics = pipeline_statistics;
176    pool->stride = uint64s_per_slot * sizeof(uint64_t);
177    pool->slots = pCreateInfo->queryCount;
178 
179    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
180       pool->data_offset = data_offset;
181       pool->snapshot_size = (pool->stride - data_offset) / 2;
182    }
183 #if GFX_VER >= 8
184    else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
185       pool->pass_size = pool->stride / n_passes;
186       pool->data_offset = data_offset;
187       pool->snapshot_size = (pool->pass_size - data_offset) / 2;
188       pool->n_counters = perf_query_info->counterIndexCount;
189       pool->counter_pass = counter_pass;
190       intel_perf_get_counters_passes(pdevice->perf,
191                                      perf_query_info->pCounterIndices,
192                                      perf_query_info->counterIndexCount,
193                                      pool->counter_pass);
194       pool->n_passes = n_passes;
195       pool->pass_query = pass_query;
196       intel_perf_get_n_passes(pdevice->perf,
197                               perf_query_info->pCounterIndices,
198                               perf_query_info->counterIndexCount,
199                               pool->pass_query);
200    }
201 #endif
202 
203    uint64_t size = pool->slots * (uint64_t)pool->stride;
204    result = anv_device_alloc_bo(device, "query-pool", size,
205                                 ANV_BO_ALLOC_MAPPED |
206                                 ANV_BO_ALLOC_SNOOPED,
207                                 0 /* explicit_address */,
208                                 &pool->bo);
209    if (result != VK_SUCCESS)
210       goto fail;
211 
212 #if GFX_VER >= 8
213    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
214       for (uint32_t p = 0; p < pool->n_passes; p++) {
215          struct mi_builder b;
216          struct anv_batch batch = {
217             .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p),
218             .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + pool->data_offset,
219          };
220          batch.next = batch.start;
221 
222          mi_builder_init(&b, &device->info, &batch);
223          mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
224                       mi_imm(p * (uint64_t)pool->pass_size));
225          anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
226       }
227    }
228 #endif
229 
230    *pQueryPool = anv_query_pool_to_handle(pool);
231 
232    return VK_SUCCESS;
233 
234  fail:
235    vk_free2(&device->vk.alloc, pAllocator, pool);
236 
237    return result;
238 }
239 
240 void genX(DestroyQueryPool)(
241     VkDevice                                    _device,
242     VkQueryPool                                 _pool,
243     const VkAllocationCallbacks*                pAllocator)
244 {
245    ANV_FROM_HANDLE(anv_device, device, _device);
246    ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
247 
248    if (!pool)
249       return;
250 
251    anv_device_release_bo(device, pool->bo);
252    vk_object_free(&device->vk, pAllocator, pool);
253 }
254 
255 #if GFX_VER >= 8
256 /**
257  * VK_KHR_performance_query layout:
258  *
259  * --------------------------------------------
260  * |       availability (8b)       | |        |
261  * |-------------------------------| |        |
262  * |      Small batch loading      | |        |
263  * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
264  * |            (24b)              | | Pass 0 |
265  * |-------------------------------| |        |
266  * |       some padding (see       | |        |
267  * | query_field_layout:alignment) | |        |
268  * |-------------------------------| |        |
269  * |           query data          | |        |
270  * | (2 * query_field_layout:size) | |        |
271  * |-------------------------------|--        | Query 0
272  * |       availability (8b)       | |        |
273  * |-------------------------------| |        |
274  * |      Small batch loading      | |        |
275  * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
276  * |            (24b)              | | Pass 1 |
277  * |-------------------------------| |        |
278  * |       some padding (see       | |        |
279  * | query_field_layout:alignment) | |        |
280  * |-------------------------------| |        |
281  * |           query data          | |        |
282  * | (2 * query_field_layout:size) | |        |
283  * |-------------------------------|-----------
284  * |       availability (8b)       | |        |
285  * |-------------------------------| |        |
286  * |      Small batch loading      | |        |
287  * |   ANV_PERF_QUERY_OFFSET_REG   | |        |
288  * |            (24b)              | | Pass 0 |
289  * |-------------------------------| |        |
290  * |       some padding (see       | |        |
291  * | query_field_layout:alignment) | |        |
292  * |-------------------------------| |        |
293  * |           query data          | |        |
294  * | (2 * query_field_layout:size) | |        |
295  * |-------------------------------|--        | Query 1
296  * |               ...             | |        |
297  * --------------------------------------------
298  */
299 
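/* In terms of the helpers below, the byte offsets into the pool BO for a
 * given (query, pass) pair are:
 *
 *    availability(query, pass) = query * stride + pass * pass_size
 *    data(query, pass, end)    = availability(query, pass) + data_offset +
 *                                (end ? snapshot_size : 0)
 */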
300 static uint64_t
301 khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
302 {
303    return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size;
304 }
305 
306 static uint64_t
307 khr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
308 {
309    return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size +
310       pool->data_offset + (end ? pool->snapshot_size : 0);
311 }
312 
313 static struct anv_address
314 khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
315 {
316    return anv_address_add(
317       (struct anv_address) { .bo = pool->bo, },
318       khr_perf_query_availability_offset(pool, query, pass));
319 }
320 
321 static struct anv_address
322 khr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
323 {
324    return anv_address_add(
325       (struct anv_address) { .bo = pool->bo, },
326       khr_perf_query_data_offset(pool, query, pass, end));
327 }
328 
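/* Lazily allocates cmd_buffer->self_mod_locations, the array of batch
 * locations that genX(CmdBeginQueryIndexedEXT) and genX(CmdEndQueryIndexedEXT)
 * patch with query-specific addresses once the per-pass offset is known
 * (self-modifying batch); one entry per perf query command.
 */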
329 static bool
330 khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
331 {
332    if (anv_batch_has_error(&cmd_buffer->batch))
333       return false;
334 
335    if (cmd_buffer->self_mod_locations)
336       return true;
337 
338    struct anv_device *device = cmd_buffer->device;
339    const struct anv_physical_device *pdevice = device->physical;
340 
341    cmd_buffer->self_mod_locations =
342       vk_alloc(&cmd_buffer->vk.pool->alloc,
343                pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
344                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
345 
346    if (!cmd_buffer->self_mod_locations) {
347       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
348       return false;
349    }
350 
351    return true;
352 }
353 #endif
354 
355 /**
356  * VK_INTEL_performance_query layout:
357  *
358  * ---------------------------------
359  * |       availability (8b)       |
360  * |-------------------------------|
361  * |          marker (8b)          |
362  * |-------------------------------|
363  * |       some padding (see       |
364  * | query_field_layout:alignment) |
365  * |-------------------------------|
366  * |           query data          |
367  * | (2 * query_field_layout:size) |
368  * ---------------------------------
369  */
370 
371 static uint32_t
372 intel_perf_marker_offset(void)
373 {
374    return 8;
375 }
376 
377 static uint32_t
378 intel_perf_query_data_offset(struct anv_query_pool *pool, bool end)
379 {
380    return pool->data_offset + (end ? pool->snapshot_size : 0);
381 }
382 
383 static void
384 cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
385                        uint32_t value_index, uint64_t result)
386 {
387    if (flags & VK_QUERY_RESULT_64_BIT) {
388       uint64_t *dst64 = dst_slot;
389       dst64[value_index] = result;
390    } else {
391       uint32_t *dst32 = dst_slot;
392       dst32[value_index] = result;
393    }
394 }
395 
396 static void *
397 query_slot(struct anv_query_pool *pool, uint32_t query)
398 {
399    return pool->bo->map + query * pool->stride;
400 }
401 
402 static bool
403 query_is_available(struct anv_query_pool *pool, uint32_t query)
404 {
405 #if GFX_VER >= 8
406    if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
407       for (uint32_t p = 0; p < pool->n_passes; p++) {
408          volatile uint64_t *slot =
409             pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
410          if (!slot[0])
411             return false;
412       }
413       return true;
414    }
415 #endif
416 
417    return *(volatile uint64_t *)query_slot(pool, query);
418 }
419 
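/* Polls the query's availability for up to 2 seconds, checking for device
 * loss between polls.
 */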
420 static VkResult
421 wait_for_available(struct anv_device *device,
422                    struct anv_query_pool *pool, uint32_t query)
423 {
424    uint64_t abs_timeout_ns = os_time_get_absolute_timeout(2 * NSEC_PER_SEC);
425 
426    while (os_time_get_nano() < abs_timeout_ns) {
427       if (query_is_available(pool, query))
428          return VK_SUCCESS;
429       VkResult status = vk_device_check_status(&device->vk);
430       if (status != VK_SUCCESS)
431          return status;
432    }
433 
434    return vk_device_set_lost(&device->vk, "query timeout");
435 }
436 
437 VkResult genX(GetQueryPoolResults)(
438     VkDevice                                    _device,
439     VkQueryPool                                 queryPool,
440     uint32_t                                    firstQuery,
441     uint32_t                                    queryCount,
442     size_t                                      dataSize,
443     void*                                       pData,
444     VkDeviceSize                                stride,
445     VkQueryResultFlags                          flags)
446 {
447    ANV_FROM_HANDLE(anv_device, device, _device);
448    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
449 
450    assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
451           pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
452           pool->type == VK_QUERY_TYPE_TIMESTAMP ||
453           pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
454           pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
455           pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
456           pool->type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT);
457 
458    if (vk_device_is_lost(&device->vk))
459       return VK_ERROR_DEVICE_LOST;
460 
461    if (pData == NULL)
462       return VK_SUCCESS;
463 
464    void *data_end = pData + dataSize;
465 
466    VkResult status = VK_SUCCESS;
467    for (uint32_t i = 0; i < queryCount; i++) {
468       bool available = query_is_available(pool, firstQuery + i);
469 
470       if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
471          status = wait_for_available(device, pool, firstQuery + i);
472          if (status != VK_SUCCESS) {
473             return status;
474          }
475 
476          available = true;
477       }
478 
479       /* From the Vulkan 1.0.42 spec:
480        *
481        *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
482        *    both not set then no result values are written to pData for
483        *    queries that are in the unavailable state at the time of the call,
484        *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
485        *    availability state is still written to pData for those queries if
486        *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
487        *
488        * From VK_KHR_performance_query:
489        *
490        *    "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
491        *     that the result should contain the number of counters that were recorded
492        *     into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR"
493        */
494       bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
495 
496       uint32_t idx = 0;
497       switch (pool->type) {
498       case VK_QUERY_TYPE_OCCLUSION:
499       case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
500          uint64_t *slot = query_slot(pool, firstQuery + i);
501          if (write_results) {
502             /* From the Vulkan 1.2.132 spec:
503              *
504              *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
505              *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
506              *    is unavailable, an intermediate result value between zero and
507              *    the final result value is written to pData for that query."
508              */
509             uint64_t result = available ? slot[2] - slot[1] : 0;
510             cpu_write_query_result(pData, flags, idx, result);
511          }
512          idx++;
513          break;
514       }
515 
516       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
517          uint64_t *slot = query_slot(pool, firstQuery + i);
518          uint32_t statistics = pool->pipeline_statistics;
519          while (statistics) {
520             uint32_t stat = u_bit_scan(&statistics);
521             if (write_results) {
522                uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
523 
524                /* WaDividePSInvocationCountBy4:HSW,BDW */
525                if ((device->info.ver == 8 || device->info.verx10 == 75) &&
526                    (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
527                   result >>= 2;
528 
529                cpu_write_query_result(pData, flags, idx, result);
530             }
531             idx++;
532          }
533          assert(idx == util_bitcount(pool->pipeline_statistics));
534          break;
535       }
536 
537       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
538          uint64_t *slot = query_slot(pool, firstQuery + i);
539          if (write_results)
540             cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
541          idx++;
542          if (write_results)
543             cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
544          idx++;
545          break;
546       }
547 
548       case VK_QUERY_TYPE_TIMESTAMP: {
549          uint64_t *slot = query_slot(pool, firstQuery + i);
550          if (write_results)
551             cpu_write_query_result(pData, flags, idx, slot[1]);
552          idx++;
553          break;
554       }
555 
556 #if GFX_VER >= 8
557       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
558          const struct anv_physical_device *pdevice = device->physical;
559          assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
560                           VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
561          for (uint32_t p = 0; p < pool->n_passes; p++) {
562             const struct intel_perf_query_info *query = pool->pass_query[p];
563             struct intel_perf_query_result result;
564             intel_perf_query_result_clear(&result);
565             intel_perf_query_result_accumulate_fields(&result, query,
566                                                       pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
567                                                       pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
568                                                       false /* no_oa_accumulate */);
569             anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
570          }
571          break;
572       }
573 #endif
574 
575       case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
576          if (!write_results)
577             break;
578          const void *query_data = query_slot(pool, firstQuery + i);
579          const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
580          struct intel_perf_query_result result;
581          intel_perf_query_result_clear(&result);
582          intel_perf_query_result_accumulate_fields(&result, query,
583                                                    query_data + intel_perf_query_data_offset(pool, false),
584                                                    query_data + intel_perf_query_data_offset(pool, true),
585                                                    false /* no_oa_accumulate */);
586          intel_perf_query_result_write_mdapi(pData, stride,
587                                              &device->info,
588                                              query, &result);
589          const uint64_t *marker = query_data + intel_perf_marker_offset();
590          intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
591          break;
592       }
593 
594       default:
595          unreachable("invalid pool type");
596       }
597 
598       if (!write_results)
599          status = VK_NOT_READY;
600 
601       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
602          cpu_write_query_result(pData, flags, idx, available);
603 
604       pData += stride;
605       if (pData >= data_end)
606          break;
607    }
608 
609    return status;
610 }
611 
612 static void
613 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
614                     struct anv_address addr)
615 {
616    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
617    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
618 
619    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
620       pc.DestinationAddressType  = DAT_PPGTT;
621       pc.PostSyncOperation       = WritePSDepthCount;
622       pc.DepthStallEnable        = true;
623       pc.Address                 = addr;
624 
625       if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
626          pc.CommandStreamerStallEnable = true;
627    }
628 }
629 
630 static void
631 emit_query_mi_availability(struct mi_builder *b,
632                            struct anv_address addr,
633                            bool available)
634 {
635    mi_store(b, mi_mem64(addr), mi_imm(available));
636 }
637 
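/* Unlike emit_query_mi_availability() above, this writes the availability
 * value with a PIPE_CONTROL post-sync operation, matching how occlusion and
 * timestamp results are written (see the comment in emit_zero_queries()).
 */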
638 static void
639 emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
640                            struct anv_address addr,
641                            bool available)
642 {
643    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
644    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
645 
646    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
647       pc.DestinationAddressType  = DAT_PPGTT;
648       pc.PostSyncOperation       = WriteImmediateData;
649       pc.Address                 = addr;
650       pc.ImmediateData           = available;
651    }
652 }
653 
654 /**
655  * Goes through a series of consecutive query indices in the given pool,
656  * setting all element values to 0 and marking them as available.
657  */
658 static void
659 emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
660                   struct mi_builder *b, struct anv_query_pool *pool,
661                   uint32_t first_index, uint32_t num_queries)
662 {
663    switch (pool->type) {
664    case VK_QUERY_TYPE_OCCLUSION:
665    case VK_QUERY_TYPE_TIMESTAMP:
666       /* These queries are written with a PIPE_CONTROL, so clear them using a
667        * PIPE_CONTROL as well; that way we don't have to synchronize between
668        * two types of operations.
669        */
670       assert((pool->stride % 8) == 0);
671       for (uint32_t i = 0; i < num_queries; i++) {
672          struct anv_address slot_addr =
673             anv_query_address(pool, first_index + i);
674 
675          for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
676             emit_query_pc_availability(cmd_buffer,
677                                        anv_address_add(slot_addr, qword * 8),
678                                        false);
679          }
680          emit_query_pc_availability(cmd_buffer, slot_addr, true);
681       }
682       break;
683 
684    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
685    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
686    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
687       for (uint32_t i = 0; i < num_queries; i++) {
688          struct anv_address slot_addr =
689             anv_query_address(pool, first_index + i);
690          mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
691          emit_query_mi_availability(b, slot_addr, true);
692       }
693       break;
694 
695 #if GFX_VER >= 8
696    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
697       for (uint32_t i = 0; i < num_queries; i++) {
698          for (uint32_t p = 0; p < pool->n_passes; p++) {
699             mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false),
700                          0, 2 * pool->snapshot_size);
701             emit_query_mi_availability(b,
702                                        khr_perf_query_availability_address(pool, first_index + i, p),
703                                        true);
704          }
705       }
706       break;
707    }
708 #endif
709 
710    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
711       for (uint32_t i = 0; i < num_queries; i++) {
712          struct anv_address slot_addr =
713             anv_query_address(pool, first_index + i);
714          mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
715          emit_query_mi_availability(b, slot_addr, true);
716       }
717       break;
718 
719    default:
720       unreachable("Unsupported query type");
721    }
722 }
723 
724 void genX(CmdResetQueryPool)(
725     VkCommandBuffer                             commandBuffer,
726     VkQueryPool                                 queryPool,
727     uint32_t                                    firstQuery,
728     uint32_t                                    queryCount)
729 {
730    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
731    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
732 
733    switch (pool->type) {
734    case VK_QUERY_TYPE_OCCLUSION:
735       for (uint32_t i = 0; i < queryCount; i++) {
736          emit_query_pc_availability(cmd_buffer,
737                                     anv_query_address(pool, firstQuery + i),
738                                     false);
739       }
740       break;
741 
742    case VK_QUERY_TYPE_TIMESTAMP: {
743       for (uint32_t i = 0; i < queryCount; i++) {
744          emit_query_pc_availability(cmd_buffer,
745                                     anv_query_address(pool, firstQuery + i),
746                                     false);
747       }
748 
749       /* Add a CS stall here to make sure the PIPE_CONTROL above has
750        * completed. Otherwise some timestamps written later with MI_STORE_*
751        * commands might race with the PIPE_CONTROL in the loop above.
752        */
753       anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT,
754                                 "vkCmdResetQueryPool of timestamps");
755       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
756       break;
757    }
758 
759    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
760    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
761    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
762       struct mi_builder b;
763       mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
764 
765       for (uint32_t i = 0; i < queryCount; i++)
766          emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
767       break;
768    }
769 
770 #if GFX_VER >= 8
771    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
772       struct mi_builder b;
773       mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
774 
775       for (uint32_t i = 0; i < queryCount; i++) {
776          for (uint32_t p = 0; p < pool->n_passes; p++) {
777             emit_query_mi_availability(
778                &b,
779                khr_perf_query_availability_address(pool, firstQuery + i, p),
780                false);
781          }
782       }
783       break;
784    }
785 #endif
786 
787    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
788       struct mi_builder b;
789       mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
790 
791       for (uint32_t i = 0; i < queryCount; i++)
792          emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
793       break;
794    }
795 
796    default:
797       unreachable("Unsupported query type");
798    }
799 }
800 
801 void genX(ResetQueryPool)(
802     VkDevice                                    _device,
803     VkQueryPool                                 queryPool,
804     uint32_t                                    firstQuery,
805     uint32_t                                    queryCount)
806 {
807    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
808 
809    for (uint32_t i = 0; i < queryCount; i++) {
810       if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
811 #if GFX_VER >= 8
812          for (uint32_t p = 0; p < pool->n_passes; p++) {
813             uint64_t *pass_slot = pool->bo->map +
814                khr_perf_query_availability_offset(pool, firstQuery + i, p);
815             *pass_slot = 0;
816          }
817 #endif
818       } else {
819          uint64_t *slot = query_slot(pool, firstQuery + i);
820          *slot = 0;
821       }
822    }
823 }
824 
825 static const uint32_t vk_pipeline_stat_to_reg[] = {
826    GENX(IA_VERTICES_COUNT_num),
827    GENX(IA_PRIMITIVES_COUNT_num),
828    GENX(VS_INVOCATION_COUNT_num),
829    GENX(GS_INVOCATION_COUNT_num),
830    GENX(GS_PRIMITIVES_COUNT_num),
831    GENX(CL_INVOCATION_COUNT_num),
832    GENX(CL_PRIMITIVES_COUNT_num),
833    GENX(PS_INVOCATION_COUNT_num),
834    GENX(HS_INVOCATION_COUNT_num),
835    GENX(DS_INVOCATION_COUNT_num),
836    GENX(CS_INVOCATION_COUNT_num),
837 };
838 
839 static void
840 emit_pipeline_stat(struct mi_builder *b, uint32_t stat,
841                    struct anv_address addr)
842 {
843    STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
844                  (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
845 
846    assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
847    mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat]));
848 }
849 
850 static void
851 emit_xfb_query(struct mi_builder *b, uint32_t stream,
852                struct anv_address addr)
853 {
854    assert(stream < MAX_XFB_STREAMS);
855 
856    mi_store(b, mi_mem64(anv_address_add(addr, 0)),
857                mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
858    mi_store(b, mi_mem64(anv_address_add(addr, 16)),
859                mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
860 }
861 
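/* Captures the INTEL performance query data at the begin or end of the query:
 * an OA report via MI_REPORT_PERF_COUNT plus a set of counter register reads,
 * laid out according to the device's query field layout.
 */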
862 static void
863 emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
864                       struct anv_query_pool *pool,
865                       struct mi_builder *b,
866                       struct anv_address query_addr,
867                       bool end)
868 {
869    const struct intel_perf_query_field_layout *layout =
870       &cmd_buffer->device->physical->perf->query_layout;
871    struct anv_address data_addr =
872       anv_address_add(query_addr, intel_perf_query_data_offset(pool, end));
873 
874    for (uint32_t f = 0; f < layout->n_fields; f++) {
875       const struct intel_perf_query_field *field =
876          &layout->fields[end ? f : (layout->n_fields - 1 - f)];
877 
878       switch (field->type) {
879       case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
880          anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
881             rpc.MemoryAddress = anv_address_add(data_addr, field->location);
882          }
883          break;
884 
885       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
886       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
887       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
888       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
889       case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: {
890          struct anv_address addr = anv_address_add(data_addr, field->location);
891          struct mi_value src = field->size == 8 ?
892             mi_reg64(field->mmio_offset) :
893             mi_reg32(field->mmio_offset);
894          struct mi_value dst = field->size == 8 ?
895             mi_mem64(addr) : mi_mem32(addr);
896          mi_store(b, dst, src);
897          break;
898       }
899 
900       default:
901          unreachable("Invalid query field");
902          break;
903       }
904    }
905 }
906 
907 void genX(CmdBeginQuery)(
908     VkCommandBuffer                             commandBuffer,
909     VkQueryPool                                 queryPool,
910     uint32_t                                    query,
911     VkQueryControlFlags                         flags)
912 {
913    genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
914 }
915 
916 void genX(CmdBeginQueryIndexedEXT)(
917     VkCommandBuffer                             commandBuffer,
918     VkQueryPool                                 queryPool,
919     uint32_t                                    query,
920     VkQueryControlFlags                         flags,
921     uint32_t                                    index)
922 {
923    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
924    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
925    struct anv_address query_addr = anv_query_address(pool, query);
926 
927    struct mi_builder b;
928    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
929 
930    switch (pool->type) {
931    case VK_QUERY_TYPE_OCCLUSION:
932       emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
933       break;
934 
935    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
936       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
937          pc.CommandStreamerStallEnable = true;
938          pc.StallAtPixelScoreboard = true;
939       }
940       mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
941                    mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
942       break;
943 
944    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
945       /* TODO: This might only be necessary for certain stats */
946       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
947          pc.CommandStreamerStallEnable = true;
948          pc.StallAtPixelScoreboard = true;
949       }
950 
951       uint32_t statistics = pool->pipeline_statistics;
952       uint32_t offset = 8;
953       while (statistics) {
954          uint32_t stat = u_bit_scan(&statistics);
955          emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
956          offset += 16;
957       }
958       break;
959    }
960 
961    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
962       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
963          pc.CommandStreamerStallEnable = true;
964          pc.StallAtPixelScoreboard = true;
965       }
966       emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
967       break;
968 
969 #if GFX_VER >= 8
970    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
971       if (!khr_perf_query_ensure_relocs(cmd_buffer))
972          return;
973 
974       const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
975       const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;
976 
977       uint32_t reloc_idx = 0;
978       for (uint32_t end = 0; end < 2; end++) {
979          for (uint32_t r = 0; r < layout->n_fields; r++) {
980             const struct intel_perf_query_field *field =
981                &layout->fields[end ? r : (layout->n_fields - 1 - r)];
982             struct mi_value reg_addr =
983                mi_iadd(
984                   &b,
985                   mi_imm(intel_canonical_address(pool->bo->offset +
986                                                  khr_perf_query_data_offset(pool, query, 0, end) +
987                                                  field->location)),
988                   mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
989             cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
990 
991             if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC &&
992                 field->size == 8) {
993                reg_addr =
994                   mi_iadd(
995                      &b,
996                      mi_imm(intel_canonical_address(pool->bo->offset +
997                                                     khr_perf_query_data_offset(pool, query, 0, end) +
998                                                     field->location + 4)),
999                      mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
1000                cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
1001             }
1002          }
1003       }
1004 
1005       struct mi_value availability_write_offset =
1006          mi_iadd(
1007             &b,
1008             mi_imm(
1009                intel_canonical_address(
1010                   pool->bo->offset +
1011                   khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
1012             mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
1013       cmd_buffer->self_mod_locations[reloc_idx++] =
1014          mi_store_address(&b, availability_write_offset);
1015 
1016       assert(reloc_idx == pdevice->n_perf_query_commands);
1017 
1018       mi_self_mod_barrier(&b);
1019 
1020       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1021          pc.CommandStreamerStallEnable = true;
1022          pc.StallAtPixelScoreboard = true;
1023       }
1024       cmd_buffer->perf_query_pool = pool;
1025 
1026       cmd_buffer->perf_reloc_idx = 0;
1027       for (uint32_t r = 0; r < layout->n_fields; r++) {
1028          const struct intel_perf_query_field *field =
1029             &layout->fields[layout->n_fields - 1 - r];
1030          void *dws;
1031 
1032          switch (field->type) {
1033          case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
1034             dws = anv_batch_emitn(&cmd_buffer->batch,
1035                                   GENX(MI_REPORT_PERF_COUNT_length),
1036                                   GENX(MI_REPORT_PERF_COUNT),
1037                                   .MemoryAddress = query_addr /* Will be overwritten */);
1038             _mi_resolve_address_token(&b,
1039                                       cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1040                                       dws +
1041                                       GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
1042             break;
1043 
1044          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
1045          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
1046          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
1047          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
1048          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
1049             dws =
1050                anv_batch_emitn(&cmd_buffer->batch,
1051                                GENX(MI_STORE_REGISTER_MEM_length),
1052                                GENX(MI_STORE_REGISTER_MEM),
1053                                .RegisterAddress = field->mmio_offset,
1054                                .MemoryAddress = query_addr /* Will be overwritten */ );
1055             _mi_resolve_address_token(&b,
1056                                       cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1057                                       dws +
1058                                       GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1059             if (field->size == 8) {
1060                dws =
1061                   anv_batch_emitn(&cmd_buffer->batch,
1062                                   GENX(MI_STORE_REGISTER_MEM_length),
1063                                   GENX(MI_STORE_REGISTER_MEM),
1064                                   .RegisterAddress = field->mmio_offset + 4,
1065                                   .MemoryAddress = query_addr /* Will be overwritten */ );
1066                _mi_resolve_address_token(&b,
1067                                          cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1068                                          dws +
1069                                          GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1070             }
1071             break;
1072 
1073          default:
1074             unreachable("Invalid query field");
1075             break;
1076          }
1077       }
1078       break;
1079    }
1080 #endif
1081 
1082    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
1083       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1084          pc.CommandStreamerStallEnable = true;
1085          pc.StallAtPixelScoreboard = true;
1086       }
1087       emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
1088       break;
1089    }
1090 
1091    default:
1092       unreachable("");
1093    }
1094 }
1095 
1096 void genX(CmdEndQuery)(
1097     VkCommandBuffer                             commandBuffer,
1098     VkQueryPool                                 queryPool,
1099     uint32_t                                    query)
1100 {
1101    genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
1102 }
1103 
1104 void genX(CmdEndQueryIndexedEXT)(
1105     VkCommandBuffer                             commandBuffer,
1106     VkQueryPool                                 queryPool,
1107     uint32_t                                    query,
1108     uint32_t                                    index)
1109 {
1110    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1111    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1112    struct anv_address query_addr = anv_query_address(pool, query);
1113 
1114    struct mi_builder b;
1115    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1116 
1117    switch (pool->type) {
1118    case VK_QUERY_TYPE_OCCLUSION:
1119       emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
1120       emit_query_pc_availability(cmd_buffer, query_addr, true);
1121       break;
1122 
1123    case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1124       /* Ensure previous commands have completed before capturing the register
1125        * value.
1126        */
1127       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1128          pc.CommandStreamerStallEnable = true;
1129          pc.StallAtPixelScoreboard = true;
1130       }
1131 
1132       mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)),
1133                    mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
1134       emit_query_mi_availability(&b, query_addr, true);
1135       break;
1136 
1137    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1138       /* TODO: This might only be necessary for certain stats */
1139       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1140          pc.CommandStreamerStallEnable = true;
1141          pc.StallAtPixelScoreboard = true;
1142       }
1143 
1144       uint32_t statistics = pool->pipeline_statistics;
1145       uint32_t offset = 16;
1146       while (statistics) {
1147          uint32_t stat = u_bit_scan(&statistics);
1148          emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
1149          offset += 16;
1150       }
1151 
1152       emit_query_mi_availability(&b, query_addr, true);
1153       break;
1154    }
1155 
1156    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1157       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1158          pc.CommandStreamerStallEnable = true;
1159          pc.StallAtPixelScoreboard = true;
1160       }
1161 
1162       emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
1163       emit_query_mi_availability(&b, query_addr, true);
1164       break;
1165 
1166 #if GFX_VER >= 8
1167    case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
1168       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1169          pc.CommandStreamerStallEnable = true;
1170          pc.StallAtPixelScoreboard = true;
1171       }
1172       cmd_buffer->perf_query_pool = pool;
1173 
1174       if (!khr_perf_query_ensure_relocs(cmd_buffer))
1175          return;
1176 
1177       const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
1178       const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;
1179 
1180       void *dws;
1181       for (uint32_t r = 0; r < layout->n_fields; r++) {
1182          const struct intel_perf_query_field *field = &layout->fields[r];
1183 
1184          switch (field->type) {
1185          case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
1186             dws = anv_batch_emitn(&cmd_buffer->batch,
1187                                   GENX(MI_REPORT_PERF_COUNT_length),
1188                                   GENX(MI_REPORT_PERF_COUNT),
1189                                   .MemoryAddress = query_addr /* Will be overwritten */);
1190             _mi_resolve_address_token(&b,
1191                                       cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1192                                       dws +
1193                                       GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
1194             break;
1195 
1196          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
1197          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
1198          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
1199          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
1200          case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
1201             dws =
1202                anv_batch_emitn(&cmd_buffer->batch,
1203                                GENX(MI_STORE_REGISTER_MEM_length),
1204                                GENX(MI_STORE_REGISTER_MEM),
1205                                .RegisterAddress = field->mmio_offset,
1206                                .MemoryAddress = query_addr /* Will be overwritten */ );
1207             _mi_resolve_address_token(&b,
1208                                       cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1209                                       dws +
1210                                       GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1211             if (field->size == 8) {
1212                dws =
1213                   anv_batch_emitn(&cmd_buffer->batch,
1214                                   GENX(MI_STORE_REGISTER_MEM_length),
1215                                   GENX(MI_STORE_REGISTER_MEM),
1216                                   .RegisterAddress = field->mmio_offset + 4,
1217                                   .MemoryAddress = query_addr /* Will be overwritten */ );
1218                _mi_resolve_address_token(&b,
1219                                          cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1220                                          dws +
1221                                          GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
1222             }
1223             break;
1224 
1225          default:
1226             unreachable("Invalid query field");
1227             break;
1228          }
1229       }
1230 
1231       dws =
1232          anv_batch_emitn(&cmd_buffer->batch,
1233                          GENX(MI_STORE_DATA_IMM_length),
1234                          GENX(MI_STORE_DATA_IMM),
1235                          .ImmediateData = true);
1236       _mi_resolve_address_token(&b,
1237                                 cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
1238                                 dws +
1239                                 GENX(MI_STORE_DATA_IMM_Address_start) / 8);
1240 
1241       assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
1242       break;
1243    }
1244 #endif
1245 
1246    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
1247       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1248          pc.CommandStreamerStallEnable = true;
1249          pc.StallAtPixelScoreboard = true;
1250       }
1251       uint32_t marker_offset = intel_perf_marker_offset();
1252       mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
1253                    mi_imm(cmd_buffer->intel_perf_marker));
1254       emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true);
1255       emit_query_mi_availability(&b, query_addr, true);
1256       break;
1257    }
1258 
1259    default:
1260       unreachable("");
1261    }
1262 
1263    /* When multiview is active the spec requires that N consecutive query
1264     * indices are used, where N is the number of active views in the subpass.
1265     * The spec allows us to write the results to only one of the queries,
1266     * but we still need to manage result availability for all the query indices.
1267     * Since we only emit a single query for all active views in the
1268     * first index, mark the other query indices as being already available
1269     * with result 0.
1270     */
1271    if (cmd_buffer->state.gfx.view_mask) {
1272       const uint32_t num_queries =
1273          util_bitcount(cmd_buffer->state.gfx.view_mask);
1274       if (num_queries > 1)
1275          emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
1276    }
1277 }
1278 
1279 #define TIMESTAMP 0x2358
1280 
1281 void genX(CmdWriteTimestamp2)(
1282     VkCommandBuffer                             commandBuffer,
1283     VkPipelineStageFlags2                       stage,
1284     VkQueryPool                                 queryPool,
1285     uint32_t                                    query)
1286 {
1287    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1288    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1289    struct anv_address query_addr = anv_query_address(pool, query);
1290 
1291    assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
1292 
1293    struct mi_builder b;
1294    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1295 
1296    if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
1297       mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
1298                    mi_reg64(TIMESTAMP));
1299       emit_query_mi_availability(&b, query_addr, true);
1300    } else {
1301       /* Everything else is bottom-of-pipe */
1302       cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
1303       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1304 
1305       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1306          pc.DestinationAddressType  = DAT_PPGTT;
1307          pc.PostSyncOperation       = WriteTimestamp;
1308          pc.Address                 = anv_address_add(query_addr, 8);
1309 
1310          if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
1311             pc.CommandStreamerStallEnable = true;
1312       }
1313       emit_query_pc_availability(cmd_buffer, query_addr, true);
1314    }
1315 
1316 
1317    /* When multiview is active the spec requires that N consecutive query
1318     * indices are used, where N is the number of active views in the subpass.
1319     * The spec allows us to write the results to only one of the queries,
1320     * but we still need to manage result availability for all the query indices.
1321     * Since we only emit a single query for all active views in the
1322     * first index, mark the other query indices as being already available
1323     * with result 0.
1324     */
1325    if (cmd_buffer->state.gfx.view_mask) {
1326       const uint32_t num_queries =
1327          util_bitcount(cmd_buffer->state.gfx.view_mask);
1328       if (num_queries > 1)
1329          emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
1330    }
1331 }
1332 
1333 #if GFX_VERx10 >= 75
1334 
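/* The GPU-side result copy below builds its arithmetic with MI_MATH (through
 * mi_builder), which is only available on Haswell and later; older gens fall
 * back to the stub at the end of this file.
 */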
1335 #define MI_PREDICATE_SRC0    0x2400
1336 #define MI_PREDICATE_SRC1    0x2408
1337 #define MI_PREDICATE_RESULT  0x2418
1338 
1339 /**
1340  * Writes the result of a query to dst_addr if the value at poll_addr is equal
1341  * to the reference value.
1342  */
1343 static void
1344 gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
1345                             struct mi_builder *b,
1346                             struct anv_address poll_addr,
1347                             struct anv_address dst_addr,
1348                             uint64_t ref_value,
1349                             VkQueryResultFlags flags,
1350                             uint32_t value_index,
1351                             struct mi_value query_result)
1352 {
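   /* Load the value at poll_addr and the reference value into the MI
    * predicate source registers and set the predicate when they compare
    * equal; the mi_store_if() below only writes while the predicate is set.
    */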
1353    mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));
1354    mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));
1355    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
1356       mip.LoadOperation    = LOAD_LOAD;
1357       mip.CombineOperation = COMBINE_SET;
1358       mip.CompareOperation = COMPARE_SRCS_EQUAL;
1359    }
1360 
1361    if (flags & VK_QUERY_RESULT_64_BIT) {
1362       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
1363       mi_store_if(b, mi_mem64(res_addr), query_result);
1364    } else {
1365       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
1366       mi_store_if(b, mi_mem32(res_addr), query_result);
1367    }
1368 }
1369 
1370 static void
1371 gpu_write_query_result(struct mi_builder *b,
1372                        struct anv_address dst_addr,
1373                        VkQueryResultFlags flags,
1374                        uint32_t value_index,
1375                        struct mi_value query_result)
1376 {
1377    if (flags & VK_QUERY_RESULT_64_BIT) {
1378       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
1379       mi_store(b, mi_mem64(res_addr), query_result);
1380    } else {
1381       struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
1382       mi_store(b, mi_mem32(res_addr), query_result);
1383    }
1384 }
1385 
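/* A query value is stored as a (begin, end) pair of 64-bit snapshots at
 * addr + 0 and addr + 8; the result is simply end - begin, computed on the
 * GPU with MI_MATH.
 */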
1386 static struct mi_value
1387 compute_query_result(struct mi_builder *b, struct anv_address addr)
1388 {
1389    return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),
1390                      mi_mem64(anv_address_add(addr, 0)));
1391 }
1392 
1393 void genX(CmdCopyQueryPoolResults)(
1394     VkCommandBuffer                             commandBuffer,
1395     VkQueryPool                                 queryPool,
1396     uint32_t                                    firstQuery,
1397     uint32_t                                    queryCount,
1398     VkBuffer                                    destBuffer,
1399     VkDeviceSize                                destOffset,
1400     VkDeviceSize                                destStride,
1401     VkQueryResultFlags                          flags)
1402 {
1403    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1404    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
1405    ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
1406 
1407    struct mi_builder b;
1408    mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
1409    struct mi_value result;
1410 
1411    /* If render target writes are ongoing, request a render target cache flush
1412     * to ensure proper ordering of the commands from the 3d pipe and the
1413     * command streamer.
1414     */
1415    if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
1416       anv_add_pending_pipe_bits(cmd_buffer,
1417                                 ANV_PIPE_TILE_CACHE_FLUSH_BIT |
1418                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
1419                                 "CopyQueryPoolResults");
1420    }
1421 
1422    if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
1423        (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
1424        /* Occlusion & timestamp queries are written using a PIPE_CONTROL, and
1425         * because we're about to copy their values with MI commands, we need to
1426         * stall the command streamer to make sure the PIPE_CONTROL writes have
1427         * landed; otherwise we could see inconsistent values & availability.
1428         *
1429         *  From the vulkan spec:
1430         *
1431         *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1432         *     previous uses of vkCmdResetQueryPool in the same queue, without
1433         *     any additional synchronization."
1434         */
1435        pool->type == VK_QUERY_TYPE_OCCLUSION ||
1436        pool->type == VK_QUERY_TYPE_TIMESTAMP) {
1437       anv_add_pending_pipe_bits(cmd_buffer,
1438                                 ANV_PIPE_CS_STALL_BIT,
1439                                 "CopyQueryPoolResults");
1440       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1441    }
1442 
1443    struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
1444    for (uint32_t i = 0; i < queryCount; i++) {
1445       struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
1446       uint32_t idx = 0;
1447       switch (pool->type) {
1448       case VK_QUERY_TYPE_OCCLUSION:
1449       case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
1450          result = compute_query_result(&b, anv_address_add(query_addr, 8));
1451          /* Like in the case of vkGetQueryPoolResults, if the query is
1452           * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
1453           * conservatively write 0 as the query result. If the
1454           * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
1455           */
1456          gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1457                1 /* available */, flags, idx, result);
1458          if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
1459             gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1460                   0 /* unavailable */, flags, idx, mi_imm(0));
1461          }
1462          idx++;
1463          break;
1464 
1465       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
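         /* Each enabled statistic occupies its own (begin, end) pair of
          * 64-bit values after the availability qword, in the bit order of
          * the statistics mask, hence the idx * 16 + 8 offset below.
          */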
1466          uint32_t statistics = pool->pipeline_statistics;
1467          while (statistics) {
1468             uint32_t stat = u_bit_scan(&statistics);
1469 
1470             result = compute_query_result(&b, anv_address_add(query_addr,
1471                                                               idx * 16 + 8));
1472 
1473             /* WaDividePSInvocationCountBy4:HSW,BDW */
1474             if ((cmd_buffer->device->info.ver == 8 ||
1475                  cmd_buffer->device->info.verx10 == 75) &&
1476                 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
1477                result = mi_ushr32_imm(&b, result, 2);
1478             }
1479 
1480             gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1481          }
1482          assert(idx == util_bitcount(pool->pipeline_statistics));
1483          break;
1484       }
1485 
1486       case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
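         /* Transform feedback queries return two values, primitives written
          * followed by primitives needed, each stored as its own
          * (begin, end) pair at offsets 8 and 24.
          */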
1487          result = compute_query_result(&b, anv_address_add(query_addr, 8));
1488          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1489          result = compute_query_result(&b, anv_address_add(query_addr, 24));
1490          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1491          break;
1492 
1493       case VK_QUERY_TYPE_TIMESTAMP:
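         /* Timestamps are a single 64-bit value at offset 8; copy it through
          * unmodified.
          */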
1494          result = mi_mem64(anv_address_add(query_addr, 8));
1495          gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1496          break;
1497 
1498 #if GFX_VER >= 8
1499       case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
1500          unreachable("Copy KHR performance query results not implemented");
1501          break;
1502 #endif
1503 
1504       default:
1505          unreachable("unhandled query type");
1506       }
1507 
1508       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1509          gpu_write_query_result(&b, dest_addr, flags, idx,
1510                                 mi_mem64(query_addr));
1511       }
1512 
1513       dest_addr = anv_address_add(dest_addr, destStride);
1514    }
1515 }
1516 
1517 #else
1518 void genX(CmdCopyQueryPoolResults)(
1519     VkCommandBuffer                             commandBuffer,
1520     VkQueryPool                                 queryPool,
1521     uint32_t                                    firstQuery,
1522     uint32_t                                    queryCount,
1523     VkBuffer                                    destBuffer,
1524     VkDeviceSize                                destOffset,
1525     VkDeviceSize                                destStride,
1526     VkQueryResultFlags                          flags)
1527 {
1528    anv_finishme("Queries not yet supported on Ivy Bridge");
1529 }
1530 #endif
1531