// Source repository: zhangyang / zhipuzi_pos_windows
							#include "../pch/pch.h"

#include "YoloFeatureExtractor.h"
#include <fstream>
#include <algorithm>
#include <iostream>
#include <functional>
#include <numeric>

#include "../tool/debuglog.h"

// Constructs the extractor: fixes the network input size at 224x224, loads the
// ONNX model found at modelPath, then reads the class-name list from classesPath.
//
// modelPath   - path handed to cv::dnn::readNetFromONNX.
// classesPath - path of the text file handed to loadClassNames.
YoloFeatureExtractor::YoloFeatureExtractor(const std::string & modelPath, const std::string & classesPath)
	: inputWidth(224)
	, inputHeight(224)
{
	// The network is loaded first; the class names are read afterwards.
	this->net = cv::dnn::readNetFromONNX(modelPath);
	this->loadClassNames(classesPath);
}

// Appends every line of the text file `file` to classNames, in file order.
//
// If the file cannot be opened the read loop never runs and classNames is left
// unchanged; no error is reported.
void YoloFeatureExtractor::loadClassNames(const std::string & file)
{
	std::ifstream input(file);
	for (std::string entry; std::getline(input, entry); )
	{
		classNames.push_back(entry);
	}
}

// Global average pooling: reduces a feature map to one mean value per channel.
//
// featureMap - feature map whose axis 1 is the channel axis; every element that
//              follows a channel's start is averaged into that channel.
//              NOTE(review): presumably shaped [1, channels, height, width] with
//              a batch size of 1 - confirm against callers; a larger batch
//              would be mixed into the per-channel rows by the reshape below.
//
// Returns one float per channel, or an empty vector (after writing a message to
// std::cerr) when the map is empty or has fewer than two dimensions.
std::vector<float> YoloFeatureExtractor::globalAveragePooling(const cv::Mat & featureMap)
{
	std::vector<float> features;

	// Reject an empty feature map.
	if (featureMap.empty())
	{
		std::cerr << "特征图为空" << std::endl;
		return features;
	}

	// The channel axis (index 1) must exist.
	if (featureMap.dims < 2)
	{
		std::cerr << "特征图维度不足" << std::endl;
		return features;
	}

	// Only the channel count is needed. The spatial extents are deliberately not
	// read: indexing size[2] / size[3] is out of range for 2-D and 3-D inputs,
	// which the guard above still lets through.
	const int channels = featureMap.size[1];

	// Reshaping a multi-dimensional matrix requires continuous storage.
	const cv::Mat contiguous = featureMap.isContinuous() ? featureMap : featureMap.clone();

	// View the data as [channels, elements-per-channel].
	const cv::Mat perChannel = contiguous.reshape(1, channels);

	// Average along each row, i.e. over all elements of one channel.
	cv::Mat pooled;
	cv::reduce(perChannel, pooled, 1, cv::REDUCE_AVG);

	// Flatten to a single row; cv::Mat converts to std::vector<float> on return.
	return pooled.reshape(1, 1);
}

// Extracts an L2-normalised feature vector for the image at imagePath.
//
// The image is loaded, converted to a blob of inputWidth x inputHeight scaled by
// 1/255, and run through the network up to the layer that sits six entries from
// the end of net.getLayerNames(). That layer's output is L2-normalised and
// returned as a flat vector. The elapsed time of each stage is written through
// DEBUG_LOG.
//
// imagePath - path of the image file to load with cv::imread.
//
// Returns the normalised features. Any std::exception raised along the way
// (unreadable image, too few layers, empty network output, OpenCV errors) is
// logged and turned into an empty vector; nothing is thrown to the caller.
std::vector<float> YoloFeatureExtractor::extractFeatures(const std::string & imagePath)
{
	try
	{
		auto time_1 = std::chrono::high_resolution_clock::now();

		cv::Mat image = cv::imread(imagePath);
		if (image.empty())
		{
			throw std::runtime_error("Could not load image: " + imagePath);
		}

		// blobFromImage performs the resize itself; the trailing (true, false)
		// arguments request an R/B channel swap and no centre crop.
		cv::Mat blob;
		cv::dnn::blobFromImage(image, blob, 1.0 / 255.0, cv::Size(inputWidth, inputHeight), cv::Scalar(0, 0, 0), true, false);
		net.setInput(blob);

		auto time_2 = std::chrono::high_resolution_clock::now();

		std::vector<cv::String> layerNames = net.getLayerNames();

		// Original author's note: the GAP layer (for "yolo2026") is usually the
		// sixth layer from the end; its output is used as the feature vector.
		const size_t featureLayerOffset = 6;
		if (layerNames.size() < featureLayerOffset)
		{
			// Without this guard the unsigned subtraction below would wrap around
			// and index far outside the vector.
			throw std::runtime_error("Network has fewer layers than the feature-layer offset");
		}

		std::vector<cv::String> outputNames;
		outputNames.push_back(layerNames[layerNames.size() - featureLayerOffset]);

		std::vector<cv::Mat> outputs;
		net.forward(outputs, outputNames);

		auto time_3 = std::chrono::high_resolution_clock::now();

		// Make sure the forward pass actually produced something.
		if (outputs.empty() || outputs[0].empty())
		{
			throw std::runtime_error("模型前向传播未产生有效输出");
		}

		// L2-normalise the layer output in place.
		cv::Mat featuresMat = outputs[0];
		cv::normalize(featuresMat, featuresMat, 1.0, 0.0, cv::NORM_L2);

		// Flatten into a std::vector<float>.
		// NOTE(review): assumes the layer output holds 32-bit floats - confirm.
		std::vector<float> features(featuresMat.begin<float>(), featuresMat.end<float>());

		auto time_4 = std::chrono::high_resolution_clock::now();


		auto duration_1 = std::chrono::duration_cast<std::chrono::milliseconds>(time_2 - time_1);
		std::wstring msg = L"图片处理完成，耗时: " + std::to_wstring(duration_1.count()) + L" 毫秒";
		DEBUG_LOG(msg.c_str());

		auto duration_2 = std::chrono::duration_cast<std::chrono::milliseconds>(time_3 - time_2);
		std::wstring msg2 = L"模型前向传播完成，耗时: " + std::to_wstring(duration_2.count()) + L" 毫秒";
		DEBUG_LOG(msg2.c_str());

		auto duration_3 = std::chrono::duration_cast<std::chrono::milliseconds>(time_4 - time_3);
		std::wstring msg3 = L"特征处理完成，耗时: " + std::to_wstring(duration_3.count()) + L" 毫秒";
		DEBUG_LOG(msg3.c_str());

		auto totalDuration = std::chrono::duration_cast<std::chrono::milliseconds>(time_4 - time_1);
		std::wstring msg4 = L"总耗时: " + std::to_wstring(totalDuration.count()) + L" 毫秒";
		DEBUG_LOG(msg4.c_str());

		return features;
	}
	catch (const std::exception & e)
	{
		// NOTE(review): this call passes a narrow string while the timing logs
		// above pass wide strings - confirm DEBUG_LOG accepts both.
		DEBUG_LOG(("提取特征失败: " + std::string(e.what())).c_str());
		return {};
	}
}

std::vector<float> YoloFeatureExtractor::extractBackboneFeatures(const std::string & imagePath)
{
	cv::Mat image = cv::imread(imagePath);
	if (image.empty())
	{
		throw std::runtime_error("Could not load image: " + imagePath);
	}

	cv::Mat blob;
	cv::dnn::blobFromImage(image, blob, 1.0 / 255.0, cv::Size(inputWidth, inputHeight), cv::Scalar(0, 0, 0), true, false);
	net.setInput(blob);

	std::vector<cv::String> layerNames = net.getLayerNames();
	std::vector<cv::String> backboneLayers;

	for (const auto & name : layerNames)
	{
		if (name.find("backbone") != std::string::npos ||
			name.find("conv") != std::string::npos ||
			name.find("stage") != std::string::npos)
		{
			backboneLayers.push_back(name);
		}
	}

	if (backboneLayers.empty())
	{
		backboneLayers.push_back(layerNames[layerNames.size() / 2]);
	}

	std::vector<cv::Mat> outputs;
	net.forward(outputs, backboneLayers);

	std::vector<float> features;
	for (size_t i = 0; i < outputs.size(); ++i)
	{
		cv::Mat output = outputs[i];
		features.reserve(features.size() + output.total());
		for (int j = 0; j < output.total(); ++j)
		{
			features.push_back(output.at<float>(j));
		}
	}

	return features;
}

std::vector<std::vector<float>> YoloFeatureExtractor::extractROIFeatures(const std::string & imagePath)
{
	cv::Mat image = cv::imread(imagePath);
	if (image.empty())
	{
		throw std::runtime_error("Could not load image: " + imagePath);
	}

	cv::Mat blob;
	cv::dnn::blobFromImage(image, blob, 1.0 / 255.0, cv::Size(inputWidth, inputHeight), cv::Scalar(0, 0, 0), true, false);
	net.setInput(blob);

	std::vector<cv::Mat> outputs;
	net.forward(outputs, net.getUnconnectedOutLayersNames());

	const float CONFIDENCE_THRESHOLD = 0.5;
	const float NMS_THRESHOLD = 0.4;

	std::vector<int> classIds;
	std::vector<float> confidences;
	std::vector<cv::Rect> boxes;

	float x_factor = static_cast<float>(image.cols) / inputWidth;
	float y_factor = static_cast<float>(image.rows) / inputHeight;

	for (size_t outputIdx = 0; outputIdx < outputs.size(); ++outputIdx)
	{
		float * data = (float *)outputs[outputIdx].data;
		int rows = outputs[outputIdx].rows;
		int dimensions = outputs[outputIdx].cols;

		for (int i = 0; i < rows; ++i)
		{
			float objectness = data[4];
			if (objectness >= CONFIDENCE_THRESHOLD)
			{
				std::vector<float> probs;
				for (int c = 5; c < dimensions; ++c)
				{
					probs.push_back(data[c]);
				}

				int maxClassId = 0;
				float maxScore = probs[0];
				for (size_t p = 1; p < probs.size(); ++p)
				{
					if (probs[p] > maxScore)
					{
						maxScore = probs[p];
						maxClassId = static_cast<int>(p);
					}
				}

				if (maxScore > CONFIDENCE_THRESHOLD)
				{
					confidences.push_back(objectness * maxScore);
					classIds.push_back(maxClassId);

					float x = data[0];
					float y = data[1];
					float w = data[2];
					float h = data[3];

					int left = static_cast<int>((x - 0.5 * w) * x_factor);
					int top = static_cast<int>((y - 0.5 * h) * y_factor);
					int width = static_cast<int>(w * x_factor);
					int height = static_cast<int>(h * y_factor);

					boxes.push_back(cv::Rect(left, top, width, height));
				}
			}
			data += dimensions;
		}
	}

	std::vector<int> nms_result;
	cv::dnn::NMSBoxes(boxes, confidences, CONFIDENCE_THRESHOLD, NMS_THRESHOLD, nms_result);

	std::vector<std::vector<float>> roiFeatures;
	for (size_t i = 0; i < nms_result.size(); ++i)
	{
		int idx = nms_result[i];
		cv::Rect box = boxes[idx];

		box.x = std::max(0, std::min(box.x, image.cols - 1));
		box.y = std::max(0, std::min(box.y, image.rows - 1));
		box.width = std::max(0, std::min(box.width, image.cols - box.x));
		box.height = std::max(0, std::min(box.height, image.rows - box.y));

		std::vector<float> roiFeature;
		roiFeature.push_back(static_cast<float>(box.x) / image.cols);
		roiFeature.push_back(static_cast<float>(box.y) / image.rows);
		roiFeature.push_back(static_cast<float>(box.width) / image.cols);
		roiFeature.push_back(static_cast<float>(box.height) / image.rows);
		roiFeature.push_back(confidences[idx]);
		roiFeature.push_back(static_cast<float>(classIds[idx]));

		roiFeatures.push_back(roiFeature);
	}

	return roiFeatures;
}