It all began with my curiosity about how machine learning models work after joining a data science competition at my university. We were tasked with creating a fraud classifier model, and my team decided to use an ensemble model consisting of XGBoost (XGB) and LightGBM (LGB). We achieved around 28% accuracy, but unfortunately, my team didn’t make it to the finals. Later, I noticed announcements that similar models could be built with neural networks. I realized these models could be exported to a .pt file, which was familiar to me from my research on using YOLOv8 models for inference. This led me to conclude that I could apply a similar approach to a vision model similar to YOLO, prompting my research on yolo.cpp.

It always begins with wondering how—how am I supposed to do this when I don't know anything at all? It's not like I suddenly got some revelation in a dream. So, I started by drawing a flowchart of how I assumed things worked. I concluded that there are two main parts of the code I need to use:
Data loader
Neural Networks
I assumed the data loader would be relatively easy to make since I learned a bunch of file handling in C during my first semester. However, the main problem was the data processor. Since I wanted to create everything from scratch, I decided not to use any external libraries like TensorFlow, NumPy, or PyTorch, and I wanted to make it run fast (spoiler: it didn't). So, I decided to make it in C++.
I also changed my approach to writing code. I noticed that experienced people always write functions with their descriptions below them, so I tried that too. Surprisingly, it made the code more readable than just writing the code without explanations.
First, I researched what dataset the YOLO pretrained model uses, and I found out it’s COCO. Without fully understanding what COCO is or how to use it, I tried to use it blindly. I soon discovered that the dataset is actually 25 GB, and I realized it wouldn't be possible to download it due to my limited resources.
Then, I searched for a more viable solution for the dataset and found an interesting one called CIFAR-10. It consists of 10 classes, 32x32 images, and is less than 500 MB. So, I began to write my CIFAR-10 loader header file.
#include "cifar10_loader.h"
#include <stdio.h>
#include <stdlib.h>
std::vector<CIFAR10Image> load_cifar10_bin(const char* data_path) {
std::vector<CIFAR10Image> dataset(NUM_IMAGES);
FILE* file = fopen(data_path, "rb");
if (!file) {
fprintf(stderr, "Error opening file: %s\n", data_path);
exit(1);
}
for (int i = 0; i < NUM_IMAGES; ++i) {
unsigned char label;
fread(&label, LABEL_BYTES, 1, file);
unsigned char buffer[IMAGE_BYTES];
fread(buffer, IMAGE_BYTES, 1, file);
cv::Mat img(IMAGE_SIZE, IMAGE_SIZE, CV_8UC3);
for (int row = 0; row < IMAGE_SIZE; ++row) {
for (int col = 0; col < IMAGE_SIZE; ++col) {
img.at<cv::Vec3b>(row, col)[0] = buffer[row * IMAGE_SIZE + col]; // Blue
img.at<cv::Vec3b>(row, col)[1] = buffer[IMAGE_SIZE * IMAGE_SIZE + row * IMAGE_SIZE + col]; // Green
img.at<cv::Vec3b>(row, col)[2] = buffer[2 * IMAGE_SIZE * IMAGE_SIZE + row * IMAGE_SIZE + col]; // Red
}
}
dataset[i].image = img;
dataset[i].label = (int)label;
}
fclose(file);
return dataset;
}
And it worked great. With that in place, I could use it in main.cpp. Now it was time to face the second problem: the data processor.
I wrote these three main functions inside tiny_network.c, then sped them up with some OpenMP and added clear comments on what is happening in the important parts of the neural network:
// Builds a fully connected network from a list of layer widths.
//
// layer_sizes: width of every layer, input layer first. For each pair of
//              neighbouring layers one weight matrix (one row per neuron of
//              the later layer, one column per neuron of the earlier layer)
//              and one bias vector are appended to `weights` and `biases`.
// Weights are drawn with rand() from [-0.5, 0.5]; biases start at zero.
NeuralNetwork::NeuralNetwork(const std::vector<int>& layer_sizes) : layer_sizes(layer_sizes) {
    for (size_t layer = 1; layer < layer_sizes.size(); ++layer) {
        const int fan_out = layer_sizes[layer];
        const int fan_in = layer_sizes[layer - 1];

        // Fill the matrix row by row so rand() is consumed in a fixed order.
        std::vector<std::vector<float>> matrix(fan_out, std::vector<float>(fan_in));
        for (auto& row : matrix) {
            for (float& weight : row) {
                weight = static_cast<float>(rand()) / RAND_MAX - 0.5f;
            }
        }

        weights.push_back(matrix);
        biases.push_back(std::vector<float>(fan_out, 0.0f));
    }
}
// Runs one forward pass and returns the activations of the last layer.
//
// input: values for the first layer; it is indexed up to layer_sizes[0]
//        without a size check, so it must hold at least that many elements.
//
// Side effects: `activations` is rebuilt as [input, layer 1 output, ...] and
// is what backward() reads afterwards. `z_values` is rebuilt too; note that it
// receives the post-sigmoid values, i.e. the same numbers as `activations`.
std::vector<float> NeuralNetwork::forward(const std::vector<float>& input) {
    z_values.clear();
    activations.clear();
    activations.push_back(input);

    std::vector<float> current = input;
    const int layer_count = static_cast<int>(weights.size());
    for (int layer = 0; layer < layer_count; ++layer) {
        const int in_count = layer_sizes[layer];
        const int out_count = layer_sizes[layer + 1];
        std::vector<float> next(out_count, 0.0f);

        #pragma omp parallel for // every output neuron is computed independently
        for (int neuron = 0; neuron < out_count; ++neuron) {
            const std::vector<float>& row = weights[layer][neuron];
            float sum = 0.0f;
            for (int k = 0; k < in_count; ++k) {
                sum += row[k] * current[k];
            }
            sum += biases[layer][neuron];
            // Sigmoid, evaluated in double precision and then narrowed to float.
            next[neuron] = static_cast<float>(1.0 / (1.0 + std::exp(-sum)));
        }

        z_values.push_back(next);
        activations.push_back(next);
        current = next;
    }
    return current;
}
void NeuralNetwork::backward(const std::vector<float>& input, const std::vector<float>& target, float learning_rate) {
std::vector<float> output_gradients = activations.back();
for (size_t i = 0; i < output_gradients.size(); ++i) {
output_gradients[i] -= target[i];
}
std::vector<std::vector<float>> hidden_gradients(weights.size());
for (int l = weights.size() - 1; l >= 0; --l) {
hidden_gradients[l].resize(layer_sizes[l + 1], 0.0f);
#pragma omp parallel for // Parallelize the outer loop
for (size_t j = 0; j < layer_sizes[l + 1]; ++j) {
float gradient = output_gradients[j] * activations[l + 1][j] * (1.0f - activations[l + 1][j]);
hidden_gradients[l][j] = gradient;
for (size_t k = 0; k < layer_sizes[l]; ++k) {
#pragma omp atomic // Ensure atomic operation for weight update
weights[l][j][k] -= learning_rate * gradient * activations[l][k];
}
#pragma omp atomic // Ensure atomic operation for bias update
biases[l][j] -= learning_rate * gradient;
}
if (l > 0) {
std::vector<float> next_output_gradients(layer_sizes[l], 0.0f);
#pragma omp parallel for // Parallelize the outer loop
for (size_t k = 0; k < layer_sizes[l]; ++k) {
for (size_t j = 0; j < layer_sizes[l + 1]; ++j) {
next_output_gradients[k] += hidden_gradients[l][j] * weights[l][j][k];
}
}
output_gradients = next_output_gradients;
}
}
}
8: NeuralNetwork::NeuralNetwork(const std::vector<int>& layer_sizes) : layer_sizes(layer_sizes) {
This line defines the constructor for the NeuralNetwork class. It takes a vector of integers as input, which represents the sizes of each layer in the neural network.
10: for (size_t i = 1; i < layer_sizes.size(); ++i) {
This loop initializes the weights and biases for each layer in the neural network.
13: std::vector<std::vector<float>> layer_weights(rows, std::vector<float>(cols));
14: std::vector<float> layer_biases(rows);
These lines declare the weights and biases for each layer.
17: std::generate(layer_weights[j].begin(), layer_weights[j].end(), []() { return static_cast<float>(rand()) / RAND_MAX - 0.5f; });
18: layer_biases[j] = 0.0f;
These lines initialize the weights with small random values and biases with zeros.
20: weights.push_back(layer_weights);
21: biases.push_back(layer_biases);
These lines add the initialized weights and biases to the respective vectors.
25: std::vector<float> NeuralNetwork::forward(const std::vector<float>& input) {
This line defines the forward function, which performs a forward pass through the neural network.
30: std::vector<float> activation = input;
This line initializes the activation vector with the input.
34: #pragma omp parallel for // Parallelize the outer loop
This line uses OpenMP to parallelize the outer loop for performance.
37: z[j] += weights[i][j][k] * activation[k];
This line calculates the weighted sum of the inputs.
40: z[j] = 1.0 / (1.0 + std::exp(-z[j])); // Sigmoid activation
This line applies the sigmoid activation function.
46: return activation;
This line returns the final activation vector.
49: void NeuralNetwork::backward(const std::vector<float>& input, const std::vector<float>& target, float learning_rate) {
This line defines the backward function, which performs backpropagation to update the weights and biases.
52: output_gradients[i] -= target[i];
This line calculates the output gradients by subtracting the target from the activations.
59: #pragma omp parallel for // Parallelize the outer loop
This line uses OpenMP to parallelize the outer loop for performance.
61: float gradient = output_gradients[j] * activations[l + 1][j] * (1.0f - activations[l + 1][j]);
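This line calculates the gradient for each neuron of the current layer (output or hidden): the incoming error multiplied by the sigmoid derivative, a * (1 - a).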
65: weights[l][j][k] -= learning_rate * gradient * activations[l][k];
This line updates the weights using the calculated gradient and the learning rate.
68: biases[l][j] -= learning_rate * gradient;
This line updates the biases using the calculated gradient and the learning rate.
77: next_output_gradients[k] += hidden_gradients[l][j] * weights[l][j][k];
This line calculates the gradients that are passed back to the previous layer (the one closer to the input, which the backward loop processes next).
80: output_gradients = next_output_gradients;
This line updates the output gradients for the next iteration of the loop.
As you can see, I have no code for any GPU optimization and rely only on CPUs, since I didn’t have a GPU to test the code on; I assume I would need to learn some C/CUDA to make full use of a GPU. Now we can move on to main.cpp, where I have added some clear comments so I don’t have to explain it line by line.
#include <opencv2/opencv.hpp>
#include "cifar10_loader.h"
#include "TinyNn_optimized.h"
#include "preprocess.h"
// Run configuration for the CIFAR-10 training program.
constexpr int NUM_DATA = 5;                                  // number of dataset/data_batch_<N>.bin files to load
constexpr const char* TEST_PATH = "dataset/test_batch.bin";  // batch used for the final test pass
constexpr float LEARNING_RATE = 0.5f;                        // step size handed to NeuralNetwork::backward
constexpr int NUM_EPOCHS = 1000;                             // full passes over the training data
constexpr int BATCH_SIZE = 64;                               // samples per iteration chunk
int main() {
//loading the dataset, boring stuffs
std::vector<CIFAR10Image> combined_dataset;
for(int i = 0; i < NUM_DATA; i++){
std::string data_path = "dataset/data_batch_" + std::to_string(i + 1) + ".bin";
// Example of using the data path
std::cout << "Processing file: " << data_path << std::endl;
std::vector<CIFAR10Image> dataset = load_cifar10_bin(data_path.c_str());
// Append the loaded dataset to the combined_dataset vector
if (!dataset.empty()) {
combined_dataset.insert(combined_dataset.end(), dataset.begin(), dataset.end());
} else {
std::cerr << "Failed to load dataset from " << data_path << std::endl;
}
}
std::cout << "Total number of images in the combined dataset: " << combined_dataset.size() << std::endl;
//function demo
//const char* data_path = "dataset/data_batch_1.bin";
//std::vector<CIFAR10Image> dataset = load_cifar10_bin(data_path);
//if (!combined_dataset.empty()) {
// cv::imshow("CIFAR-10 Image", combined_dataset[0].image);
// printf("Label: %d\n", combined_dataset[0].label);
// cv::waitKey(0);
//}
//pre-cooking || preprocess the dataset
std::vector<cv::Mat> images;
std::vector<int> labels;
for (const auto& item : combined_dataset) {
images.push_back(preprocess(item.image));
labels.push_back(item.label);
}
// Split into train and validation sets (80% train, 20% validation)
int num_train = static_cast<int>(0.8 * images.size());
std::vector<cv::Mat> train_images(images.begin(), images.begin() + num_train);
std::vector<int> train_labels(labels.begin(), labels.begin() + num_train);
std::vector<cv::Mat> val_images(images.begin() + num_train, images.end());
std::vector<int> val_labels(labels.begin() + num_train, labels.end());
//cooking stuffs
//each numbers in layer_sizes in order is input, first hidden layer, second hidden layer and finally an
//output layer, the reasoning in each numbers is:
//3072 => because of 32 x 32 x 3 after flattening of cifar10 dataset,
//128 is just a wild guess for first hidden layers (?) or some reasoning of math that
//myself couldnt understand yet, goes the same for the second hidden layers,
//and finally the output layers, 10 is for representative of confidence in each classes
//so if i used cifar100 it would be 100 instead of 10, but i guess the hidden layer cant be less
//than the output layer so there were supposed to be an adjustment if i were using a different
//datasets
std::vector<int> layer_sizes = {3072, 128, 64, 10}; //input -> hidden1 -> hidden2 -> output
NeuralNetwork nn(layer_sizes);
// Training loop
for (int epoch = 0; epoch < NUM_EPOCHS; ++epoch) {
float epoch_loss = 0.0;
int num_correct = 0;
for (size_t start = 0; start < combined_dataset.size(); start += BATCH_SIZE) {
size_t end = std::min(start + BATCH_SIZE, combined_dataset.size());
std::vector<std::vector<float>> batch_inputs;
std::vector<std::vector<float>> batch_targets;
for (size_t i = start; i < end; ++i) {
const auto& item = combined_dataset[i];
std::vector<float> input(item.image.total());
std::memcpy(input.data(), item.image.ptr<float>(), item.image.total() * sizeof(float));
batch_inputs.push_back(input);
std::vector<float> target(layer_sizes.back(), 0.0f);
target[item.label] = 1.0f; // One-hot encoding
batch_targets.push_back(target);
}
for (size_t i = 0; i < batch_inputs.size(); ++i) {
std::vector<float> output = nn.forward(batch_inputs[i]);
nn.backward(batch_inputs[i], batch_targets[i], LEARNING_RATE);
int predicted_label = std::max_element(output.begin(), output.end()) - output.begin();
if (predicted_label == std::distance(batch_targets[i].begin(), std::max_element(batch_targets[i].begin(), batch_targets[i].end()))) {
num_correct++;
}
}
}
std::cout << "Epoch " << epoch << ": Accuracy = " << (num_correct / static_cast<float>(combined_dataset.size())) << std::endl;
}
// Loading and testing on the test set
std::vector<CIFAR10Image> testset = load_cifar10_bin(TEST_PATH);
std::vector<cv::Mat> test_images;
std::vector<int> test_labels;
for (const auto& item : testset) {
test_images.push_back(preprocess(item.image));
test_labels.push_back(item.label);
}
int num_correct_test = 0;
for (size_t i = 0; i < test_images.size(); ++i) {
const auto& item = combined_dataset[i];
std::vector<float> input(item.image.total());
std::memcpy(input.data(), item.image.ptr<float>(), item.image.total() * sizeof(float));
std::vector<float> output = nn.forward(input);
int predicted_label = std::max_element(output.begin(), output.end()) - output.begin();
if (predicted_label == test_labels[i]) {
num_correct_test++;
}
}
std::cout << "Test Accuracy = " << (num_correct_test / static_cast<float>(test_images.size())) << std::endl;
return 0;
}
These came together pretty well, yet I still can’t optimize any further while using only CPUs.

See you all in my next project.

