/*
 * Copyright (c) Qualcomm Innovation Center, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

// A simple llama2/3 runner that includes preprocessing and post-processing
// logic. The module takes in a string as input and emits a string as output.

#pragma once

#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

// NOTE: the original include targets and template arguments were lost in
// extraction; the ExecuTorch paths and elided type parameters below are
// reconstructed and may differ from the source tree.
#include <executorch/examples/qualcomm/qaihub_scripts/llama/runner/io_memory.h>
#include <executorch/extension/llm/sampler/sampler.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>

namespace example {

class Runner {
 public:
  explicit Runner(
      const std::vector<std::string>& models_path,
      const std::vector<std::string>& pos_embs_path,
      const std::vector<int>& shard_layers,
      const std::string& tokenizer_path,
      const int eval_mode,
      const float temperature,
      const float logits_scale,
      const int logits_offset);

  struct Stats {
    // Scaling factor for timestamps - in this case, we use ms.
    const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
    // Time stamps for the different stages of the execution
    // model_load_start_ms: Start of model loading.
    long model_load_start_ms;
    // model_load_end_ms: End of model loading.
    long model_load_end_ms;
    // inference_start_ms: Immediately after the model is loaded (or we check
    // for model load), measure the inference time.
    long inference_start_ms;
    // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
    // before the inference loop starts.
    long prompt_eval_end_ms;
    // first_token_ms: Timestamp when the first generated token is emitted.
    long first_token_ms;
    // inference_end_ms: End of inference/generation.
    long inference_end_ms;
    // Keep a running total of the time spent in sampling.
    long aggregate_sampling_time_ms;
    // Token count from prompt.
    int64_t num_prompt_tokens;
    // Token count from generated (total - prompt).
    int64_t num_generated_tokens;
  };

  bool is_loaded() const;
  executorch::runtime::Error load();
  executorch::runtime::Error generate(
      const std::string& prompt,
      const std::string& system_prompt,
      int32_t seq_len,
      std::function<void(const std::string&)> token_callback = {},
      std::function<void(const Stats&)> stats_callback = {});
  void stop();
  std::vector<executorch::runtime::Result<executorch::runtime::MethodMeta>>
  get_methods_meta();

 private:
  enum EvalMode {
    kBert = 0,
    kKVCached,
    kUnsupported,
  };
  enum LlamaVersion {
    kLlama2 = 0,
    kLlama3,
  };
  int32_t logitsToToken(const executorch::aten::Tensor& logits_tensor);
  void run_model_step(
      std::vector<std::vector<executorch::runtime::EValue>>& inputs);
  // metadata
  int32_t bos_id_;
  std::unordered_set<uint64_t> eos_id_;
  const int32_t n_bos_;
  const int32_t n_eos_;
  const int32_t vocab_size_;
  const int32_t max_seq_len_;
  int32_t eval_mode_;
  std::vector<std::shared_ptr<executorch::extension::Module>> modules_;
  std::vector<std::string> method_names_;
  std::string tokenizer_path_;
  float temperature_;
  std::unique_ptr<executorch::extension::llm::Tokenizer> tokenizer_;
  std::unique_ptr<executorch::extension::llm::Sampler> sampler_;
  Stats stats_;
  std::unique_ptr<Memory> io_mem_;
  const float logits_scale_;
  const int32_t logits_offset_;
  LlamaVersion version_;
};

} // namespace example
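
// Example usage: a minimal sketch, not part of the original header. The shard
// file names, layer split, and quantization parameters below are hypothetical,
// and error handling is omitted:
//
//   example::Runner runner(
//       {"llama_shard_0.pte", "llama_shard_1.pte"}, // models_path
//       {"freq_cos.raw", "freq_sin.raw"},           // pos_embs_path
//       {16, 16},          // shard_layers: decoder layers per shard
//       "tokenizer.bin",   // tokenizer_path
//       1,                 // eval_mode: EvalMode::kKVCached
//       0.8f,              // temperature
//       0.0078f,           // logits_scale (logits dequantization scale)
//       0);                // logits_offset (logits dequantization zero point)
//   if (runner.load() == executorch::runtime::Error::Ok) {
//     runner.generate(
//         "What is the capital of France?",  // prompt
//         "You are a helpful assistant.",    // system_prompt
//         128,                               // seq_len: generation budget
//         [](const std::string& piece) { std::cout << piece << std::flush; });
//   }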
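
// Interpreting Stats: all timestamps share SCALING_FACTOR_UNITS_PER_SECOND
// (1000, i.e. milliseconds), so throughput falls out of simple differences.
// Below is a sketch of a stats_callback that reports prefill and decode
// rates; the lambda name and output format are illustrative:
//
//   auto report = [](const example::Runner::Stats& s) {
//     const double unit = (double)s.SCALING_FACTOR_UNITS_PER_SECOND;
//     double prefill_s = (s.prompt_eval_end_ms - s.inference_start_ms) / unit;
//     double decode_s = (s.inference_end_ms - s.prompt_eval_end_ms) / unit;
//     std::printf(
//         "prefill: %.2f tok/s, decode: %.2f tok/s\n",
//         s.num_prompt_tokens / prefill_s,
//         s.num_generated_tokens / decode_s);
//   };
//   runner.generate(prompt, system_prompt, seq_len, token_callback, report);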