inference.py
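
Inference script: loads a trained image-captioning checkpoint and its vocabulary, then generates captions for input images.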

import json

import torch
from PIL import Image
from torchvision import transforms

from models.model import ImageCaptioningModel


class CaptionInference:
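    """Loads a trained ImageCaptioningModel and its vocabulary, and
    generates captions for images."""
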
    def __init__(self, model_path, vocab_path,
                 device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.device = device

        # Load vocabulary (word -> index) and build the reverse mapping
        # used when decoding generated indices back into words
        with open(vocab_path, 'r') as f:
            self.vocab = json.load(f)
        self.idx2word = {index: word for word, index in self.vocab.items()}

        # Model hyperparameters -- make sure these match your training run
        self.embed_size = 256
        self.hidden_size = 512
        self.vocab_size = len(self.vocab)

        # Create the model and move it to the target device
        self.model = ImageCaptioningModel(
            embed_size=self.embed_size,
            hidden_size=self.hidden_size,
            vocab_size=self.vocab_size
        ).to(self.device)

        # Load trained weights (the checkpoint is expected to store them
        # under the 'model_state_dict' key, as saved during training)
        checkpoint = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

        # Image preprocessing: resize to the encoder's input size and
        # normalize with ImageNet statistics
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])
    def idx_to_word(self, idx):
        # Decode a single index back to its word; int() also handles indices
        # that arrive as 0-dim tensors. Unknown indices map to the '<UNK>'
        # token string.
        return self.idx2word.get(int(idx), '<UNK>')
    def generate_caption(self, image_path, max_length=20):
        # Load and preprocess the image into a (1, 3, 224, 224) batch
        image = Image.open(image_path).convert('RGB')
        image = self.transform(image).unsqueeze(0).to(self.device)

        # Generate caption indices
        with torch.no_grad():
            caption_indices = self.model.generate_caption(
                image,
                self.vocab,
                max_length=max_length
            )

        # Convert indices to words, stopping at <END> and dropping special tokens
        caption_words = []
        for idx in caption_indices:
            word = self.idx_to_word(idx)
            if word == '<END>':
                break
            if word not in ('<START>', '<PAD>', '<UNK>'):
                caption_words.append(word)
        return ' '.join(caption_words)

# Example usage
def main():
    # Initialize inference
    inference = CaptionInference(
        model_path='checkpoints/model_epoch_2.pth',  # Update with your model path
        vocab_path='vocabulary.json'                 # Update with your vocabulary path
    )

    # Generate a caption for a single image
    image_path = 'path/to/your/test/image.jpg'  # Update with your image path
    caption = inference.generate_caption(image_path)
    print(f"Generated caption: {caption}")

    # Generate captions for multiple images
    test_images = [
        'path/to/image1.jpg',
        'path/to/image2.jpg',
        'path/to/image3.jpg'
    ]
    for img_path in test_images:
        caption = inference.generate_caption(img_path)
        print(f"\nImage: {img_path}")
        print(f"Caption: {caption}")

if __name__ == '__main__':
    main()