diff --git a/common/arg.cpp b/common/arg.cpp
index e0f1d998f6056..0221faa80a8c4 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2628,6 +2628,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--show-statistics"},
+        string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
+        [](common_params & params) {
+            params.show_statistics = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--parse-special"},
         string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
diff --git a/common/common.h b/common/common.h
index d051d4ec971c4..35c5e4bdc8f29 100644
--- a/common/common.h
+++ b/common/common.h
@@ -407,9 +407,10 @@ struct common_params {
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk     = 0; // start processing from this chunk
 
-    bool process_output = false; // collect data for the output tensor
-    bool compute_ppl    = true;  // whether to compute perplexity
-    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization
+    bool process_output  = false; // collect data for the output tensor
+    bool compute_ppl     = true;  // whether to compute perplexity
+    bool show_statistics = false; // show imatrix statistics per tensor
+    bool parse_special   = false; // whether to parse special tokens during imatrix tokenization
 
     // cvector-generator params
     int n_pca_batch = 100;
diff --git a/tools/imatrix/CMakeLists.txt b/tools/imatrix/CMakeLists.txt
index 412696c47c31c..078e73161dd10 100644
--- a/tools/imatrix/CMakeLists.txt
+++ b/tools/imatrix/CMakeLists.txt
@@ -2,4 +2,5 @@ set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(${TARGET} PRIVATE ../../src)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp
index 81d0404d683d5..d81416809ffea 100644
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -14,6 +14,8 @@
 #include <vector>
 #include <fstream>
 #include <unordered_map>
+#include <numeric>
+#include <regex>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -21,10 +23,9 @@
 static void print_usage(int, char ** argv) {
     LOG("\nexample usage:\n");
-    LOG("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
-            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
-            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] \\\n"
+    LOG("\n    %s -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output]\n"
+            "       [--chunk 123] [--output-frequency 10] [--save-frequency 0] [--show-statistics]\n"
+            "       [--no-ppl] [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n"
             "       [--parse-special]\n"
         , argv[0]);
     LOG("\n");
 }
@@ -35,13 +36,28 @@ struct Stats {
     int ncall = 0;
 };
 
+struct tensor_statistics {
+    std::string tensor;
+    Stats stats;
+    float total_bias = 0;
+    float mean_bias = 0;
+    float max_bias = 0;
+    float min_bias = 0;
+    int elements = 0;
+    float stddev = 0;
+    float active = 0;
+    float entropy = 0;
+    float zd = 0;
+    float cossim = 0;
+};
+
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
     void set_params(common_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
-    bool load_imatrix(const char * fname);
+    bool load_imatrix(const char * fname, std::vector<tensor_statistics> * tstats = nullptr);
 private:
     std::unordered_map<std::string, Stats> m_stats;
     common_params m_params;
@@ -70,6 +86,35 @@ static std::string filter_tensor_name(const char * name) {
     return wname;
 }
 
+static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) {
+    std::vector<std::string> name;
+    std::istringstream stream(input);
+    std::string item;
+
+    while (std::getline(stream, item, '.')) {
+        name.push_back(item);
+    }
+    for (size_t i = 0; i < name.size(); ++i) {
+        if (name[i] == "blk" && i + 1 < name.size()) {
+            layer = name[i + 1];
+            break;
+        }
+    }
+    for (size_t i = 0; i < name.size(); ++i) {
+        if (name[i] == "weight" && i > 0) {
+            tensor = name[i - 1];
+            break;
+        }
+    }
+
+    if (tensor.empty()) {
+        tensor = input;
+    }
+    if (layer.empty()) {
+        layer = "-";
+    }
+}
+
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     GGML_UNUSED(user_data);
 
@@ -292,7 +337,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
     LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
 }
 
-bool IMatrixCollector::load_imatrix(const char * fname) {
+bool IMatrixCollector::load_imatrix(const char * fname, std::vector<tensor_statistics> * tstats) {
     std::ifstream in(fname, std::ios::binary);
     if (!in) {
         LOG_ERR("%s: failed to open %s\n",__func__, fname);
@@ -338,14 +383,81 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
             return false;
         }
 
-        // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
+        // Recreate the state as expected by save_imatrix(), and correct for weighted sum.
+        std::vector<float> activations;
+        activations.reserve(nval);
         for (int i = 0; i < nval; i++) {
             e.values[i] += tmp[i];
             e.counts[i] += ncall;
+            activations.push_back(e.values[i] / e.counts[i]);
         }
         e.ncall += ncall;
+
+        if (tstats) {
+            float total = std::accumulate(activations.begin(), activations.end(), 0.0f);
+            float max = * std::max_element(activations.begin(), activations.end());
+            float min = * std::min_element(activations.begin(), activations.end());
+            float mean = total / activations.size();
+            float sq_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f);
+            float dev = std::sqrt((sq_total / activations.size()) - (mean * mean));
+
+            float threshold = min + min * 0.5f;
+            int inactive_count = std::count_if(activations.begin(), activations.end(), [threshold](const float v) { return fabs(v) <= threshold; });
+            float active_ratio = 1 - static_cast<float>(inactive_count) / activations.size();
+
+            float ent = 0;
+            if (total > 0) {
+                for (auto act : activations) {
+                    if (float p = act / total; p > 0) {
+                        ent -= p * std::log2(p);
+                    }
+                }
+            }
+
+            int z_score = 0;
+            for (auto act : activations) {
+                if (float p = (act - mean) / dev; p > 1) {
+                    z_score++;
+                }
+            }
+
+            tstats->emplace_back();
+            auto & ts = (*tstats)[i];
+            ts.tensor = name_as_vec.data();
+            ts.stats = e;
+            ts.total_bias = total;
+            ts.mean_bias = mean;
+            ts.max_bias = max;
+            ts.min_bias = min;
+            ts.elements = static_cast<int>(activations.size());
+            ts.stddev = dev;
+            ts.active = active_ratio;
+            ts.entropy = ent;
+            ts.zd = static_cast<float>(z_score) / ts.elements;
+        }
     }
+
+    if (tstats) {
+        static const std::regex pattern(R"(blk\.(\d+)\.)");
+        for (auto & ts : *tstats) {
+            if (std::smatch match; std::regex_search(ts.tensor, match, pattern)) {
+                const int blk = std::stoi(match[1]);
+                std::string tname(ts.tensor);
+                tname.replace(match.position(1), match.length(1), std::to_string(blk-1));
+                auto prev = std::find_if(tstats->begin(), tstats->end(), [tname](const tensor_statistics & t) { return t.tensor == tname; });
+                if (prev != tstats->end()) {
+                    const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), prev->stats.values.begin(), 0.0f);
+                    const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), ts.stats.values.begin(), 0.0f));
+                    const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(), prev->stats.values.begin(), 0.0f));
+                    const float cs = dp / (curr_mag * prev_mag);
+                    ts.cossim = cs;
+                }
+            } else {
+                ts.cossim = 0;
+            }
+        }
+    }
+
     return true;
 }
 
@@ -355,7 +467,6 @@ static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_dat
     return g_collector.collect_imatrix(t, ask, user_data);
 }
 
-
 struct results_log_softmax {
     double log_softmax;
     float logit;
@@ -592,6 +703,49 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    std::vector<tensor_statistics> ts;
+
+    if (params.show_statistics) {
+        if (params.in_files.empty() || params.in_files.size() > 1) {
+            LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n");
+            return 1;
+        }
+        if (!g_collector.load_imatrix(params.in_files[0].c_str(), & ts)) {
+            LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str());
+            return 1;
+        }
+        if (ts.empty()) {
+            LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str());
+            return 1;
+        }
+
+        struct tensor_comparer {
+            bool operator()(const tensor_statistics & a, const tensor_statistics & b) const {
+                std::string layer, name_a, name_b;
+                process_tensor_name(a.tensor, layer, name_a);
+                process_tensor_name(b.tensor, layer, name_b);
+                return name_a < name_b || (name_a == name_b && a.total_bias > b.total_bias);
+            }
+        };
+        std::sort(ts.begin(), ts.end(), tensor_comparer());
+
+        LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast<int>(ts.size()));
+        LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n",
+                " Layer", " Tensor", " Σ(Bias)", " Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD", " CosSim");
+        LOG_INF("=========================================================================================================================================================================\n");
+        for (const auto & tstat : ts) {
+            std::string layer, name;
+            process_tensor_name(tstat.tensor, layer, name);
+            LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n",
+                    layer.c_str(), name.c_str(), tstat.total_bias, tstat.min_bias, tstat.max_bias, tstat.mean_bias, tstat.stddev,
+                    tstat.active * 100.0f, tstat.elements, tstat.entropy, 100.0f * (tstat.entropy / std::log2(tstat.elements)),
+                    100.0f * tstat.zd, tstat.cossim);
+        }
+        LOG_INF("\n");
+
+        return 0;
+    }
+
     common_init();
 
     params.n_batch = std::min(params.n_batch, params.n_ctx);
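
Example invocation of the new flag (the model, calibration text, and imatrix file names below are illustrative):

    # collect an importance matrix over some calibration text (existing workflow)
    llama-imatrix -m model.gguf -f calibration.txt -o imatrix.dat

    # with this patch applied: load the file, print the per-tensor statistics table, and exit
    llama-imatrix --in-file imatrix.dat --show-statistics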
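Reading the table, as computed by the code above: Σ(Bias) is the sum of the tensor's mean activations, and Min/Max/μ/σ are their extremes, mean, and standard deviation. % Active is the share of elements whose mean activation magnitude exceeds 1.5x the observed minimum, and N is the element count. Entropy is the Shannon entropy of the normalized activation distribution, with E (norm) expressing it as a percentage of its log2(N) maximum. ZD is the fraction of elements whose z-score exceeds 1, and CosSim is the cosine similarity with the same tensor in the preceding layer (0 when no previous-layer counterpart exists).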