Codsoft/Task2.cpp at main · DwivediAnkush/Codsoft · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/*IMAGE CAPTIONING

Combine computer vision and natural language processing to build
an image captioning AI. Use pre-trained image recognition models
like VGG or ResNet to extract features from images, and then use a
recurrent neural network (RNN) or transformer-based model to
generate captions for those images.
*/

#include <torch/torch.h> // Include PyTorch library for deep learning
#include <torch/script.h> // Include PyTorch's JIT compiler
#include <opencv2/opencv.hpp> // Include OpenCV library for image processing
#include <fstream> // Include fstream for file operations
#include <iostream> // Include iostream for input and output operations
#include <string> // Include string for string operations

using namespace std; // Use the standard namespace
using namespace cv; // Use the OpenCV namespace

// Define a function to load an image and preprocess it
torch::Tensor load_image(const std::string& image_path) {
    // Load the image using OpenCV
    Mat image = imread(image_path, IMREAD_COLOR); // Load image in color mode
    if (image.empty()) { // Check if the image is loaded successfully
        cerr << "Could not open or find the image!\n" << endl; // Print an error message
        return torch::Tensor(); // Return an empty tensor
    }

    // Resize the image to 224x224 pixels
    resize(image, image, Size(224, 224)); // Resize the image

    // Convert the image to a tensor and normalize it
    auto img_tensor = torch::from_blob(image.data, {1, image.rows, image.cols, 3}, torch::kByte); // Convert image to tensor
    img_tensor = img_tensor.permute({0, 3, 1, 2}); // Change dimensions to {1, 3, 224, 224}
    img_tensor = img_tensor.to(torch::kFloat32).div(255); // Convert to float and normalize
    img_tensor = torch::data::transforms::Normalize<>( // Normalize the image
        {0.485, 0.456, 0.406}, {0.229, 0.224, 0.225}
    )(img_tensor);

    return img_tensor; // Return the preprocessed image tensor
}

// Define the RNN model for caption generation
struct CaptionGeneratorRNN : torch::nn::Module {
    torch::nn::Embedding embed{nullptr}; // Embedding layer
    torch::nn::LSTM lstm{nullptr}; // LSTM layer
    torch::nn::Linear linear{nullptr}; // Linear layer

    CaptionGeneratorRNN(int64_t vocab_size, int64_t embed_size, int64_t hidden_size, int64_t num_layers) {
        embed = register_module("embed", torch::nn::Embedding(vocab_size, embed_size)); // Register the embedding layer
        lstm = register_module("lstm", torch::nn::LSTM(torch::nn::LSTMOptions(embed_size, hidden_size).num_layers(num_layers).batch_first(true))); // Register the LSTM layer
        linear = register_module("linear", torch::nn::Linear(hidden_size, vocab_size)); // Register the linear layer
    }

    torch::Tensor forward(torch::Tensor features, torch::Tensor captions) {
        auto embeddings = embed(captions); // Embed the captions
        embeddings = torch::cat({features.unsqueeze(1), embeddings}, 1); // Concatenate image features with embeddings
        auto lstm_out = lstm(embeddings); // Pass through the LSTM
        auto outputs = linear(std::get<0>(lstm_out)); // Get word scores from the linear layer
        return outputs; // Return the outputs
    }
};

// Function to generate a caption for an image
std::string generate_caption(const std::string& image_path, torch::jit::script::Module& feature_extractor, CaptionGeneratorRNN& caption_generator, const std::vector<std::string>& vocab) {
    // Load and preprocess the image
    auto image_tensor = load_image(image_path);

    // Extract image features using the pre-trained model
    std::vector<torch::jit::IValue> inputs;
    inputs.push_back(image_tensor);
    auto image_features = feature_extractor.forward(inputs).toTensor().squeeze();

    // Generate a caption using the RNN model (this is a simplified example; training the model is necessary for actual use)
    std::vector<int64_t> caption_indices = {0}; // Start token (dummy example)
    torch::Tensor captions = torch::tensor(caption_indices).unsqueeze(0); // Convert to tensor

    auto outputs = caption_generator.forward(image_features, captions); // Generate caption
    auto predicted_indices = outputs.argmax(2).squeeze().tolist<int64_t>(); // Get the predicted word indices

    // Convert indices to words
    std::string caption;
    for (const auto& idx : predicted_indices) {
        if (idx < vocab.size()) { // Check if index is within the vocabulary size
            caption += vocab[idx] + " "; // Append the word to the caption
        }
    }

    return caption; // Return the generated caption
}

int main() {
    // Load the pre-trained ResNet model for feature extraction
    torch::jit::script::Module feature_extractor;
    try {
        feature_extractor = torch::jit::load("resnet50.pt"); // Load the ResNet model
    } catch (const c10::Error& e) {
        cerr << "Error loading the model\n"; // Print an error message if the model fails to load
        return -1; // Return with an error code
    }

    // Vocabulary (dummy example, in practice use a proper tokenizer)
    std::vector<std::string> vocab = {"<start>", "a", "cat", "sitting", "on", "couch", "<end>"}; // Dummy vocabulary

    // Create the RNN model for caption generation
    int64_t vocab_size = vocab.size(); // Vocabulary size
    int64_t embed_size = 256; // Embedding size
    int64_t hidden_size = 512; // LSTM hidden size
    int64_t num_layers = 1; // Number of LSTM layers
    CaptionGeneratorRNN caption_generator(vocab_size, embed_size, hidden_size, num_layers);

    // Example usage
    std::string image_path = "path_to_your_image.jpg"; // Path to the input image
    std::string caption = generate_caption(image_path, feature_extractor, caption_generator, vocab); // Generate a caption for the image
    cout << "Generated Caption: " << caption << endl; // Print the generated caption

    return 0; // Return with success
}