How to handle Multi Label DataSet from Directory for image captioning in PyTorch
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty,.everyoneloves__bot-mid-leaderboard:empty{ height:90px;width:728px;box-sizing:border-box;
}
I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me
Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.
Here they have used Standard COCO Dataset.
I have dataset as images/ and captions/ directory .
Example
Directory Structure:
images/T001.jpg
images/T002.jpg
...
...
captions/T001.txt
captions/T002.txt
....
....
The above is the relation. Caption file has 'n' number of captions in each separate line.
I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.
Any guidance/suggestion on how to achieving this.
++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:
from __future__ import print_function
import torch
from torchvision import datasets, models, transforms
from torchvision import transforms
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim
import torch.nn as nn
#from torch import np
import numpy as np
import utils_c
from data_loader_c import get_cust_data_loader
from models import CNN, RNN
from vocab_custom import Vocabulary, load_vocab
import os
class ImageCaptionDataSet(data.Dataset):
def __init__(self, path, json, vocab=None, transform=None):
self.vocab = vocab
self.transform = transform
self.img_dir_path = path
self.cap_dir_path = json
self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))
self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))
pass
def __getitem__(self,index):
vocab = self.vocab
img_path = self.all_imgs_path[index]
img_base_name = os.path.basename(img_path)
cap_base_name = img_base_name.replace(".jpg",".txt")
cap_path = os.path.join(self.cap_dir_path,cap_base_name)
caption_all_for_a_image = open(cap_path).read().split("n")
image = Image.open(img_path)
image = image.convert('RGB')
if self.transform != None:
# apply image preprocessing
image = self.transform(image)
#captions_combined =
#max_len = 0
#for caption in caption_all_for_a_image:
# caption_str = str(caption).lower()
# tokens = nltk.tokenize.word_tokenize(caption_str)
# m = len(tokens) + 2
# if m>max_len:
# max_len = m
# caption = torch.Tensor([vocab(vocab.start_token())] +
# [vocab(token) for token in tokens] +
# [vocab(vocab.end_token())])
# captions_combined.append(caption)
# #yield image, caption
#return image,torch.Tensor(captions_combined)
caption_str = str(caption_all_for_a_image).lower()
tokens = nltk.tokenize.word_tokenize(caption_str)
caption = torch.Tensor([vocab(vocab.start_token())] +
[vocab(token) for token in tokens] +
[vocab(vocab.end_token())])
return image,caption
def __len__(self):
return len(self.all_imgs_path)
+++++++++++++++++++++++++++++++++
python pytorch
add a comment |
I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me
Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.
Here they have used Standard COCO Dataset.
I have dataset as images/ and captions/ directory .
Example
Directory Structure:
images/T001.jpg
images/T002.jpg
...
...
captions/T001.txt
captions/T002.txt
....
....
The above is the relation. Caption file has 'n' number of captions in each separate line.
I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.
Any guidance/suggestion on how to achieving this.
++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:
from __future__ import print_function
import torch
from torchvision import datasets, models, transforms
from torchvision import transforms
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim
import torch.nn as nn
#from torch import np
import numpy as np
import utils_c
from data_loader_c import get_cust_data_loader
from models import CNN, RNN
from vocab_custom import Vocabulary, load_vocab
import os
class ImageCaptionDataSet(data.Dataset):
def __init__(self, path, json, vocab=None, transform=None):
self.vocab = vocab
self.transform = transform
self.img_dir_path = path
self.cap_dir_path = json
self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))
self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))
pass
def __getitem__(self,index):
vocab = self.vocab
img_path = self.all_imgs_path[index]
img_base_name = os.path.basename(img_path)
cap_base_name = img_base_name.replace(".jpg",".txt")
cap_path = os.path.join(self.cap_dir_path,cap_base_name)
caption_all_for_a_image = open(cap_path).read().split("n")
image = Image.open(img_path)
image = image.convert('RGB')
if self.transform != None:
# apply image preprocessing
image = self.transform(image)
#captions_combined =
#max_len = 0
#for caption in caption_all_for_a_image:
# caption_str = str(caption).lower()
# tokens = nltk.tokenize.word_tokenize(caption_str)
# m = len(tokens) + 2
# if m>max_len:
# max_len = m
# caption = torch.Tensor([vocab(vocab.start_token())] +
# [vocab(token) for token in tokens] +
# [vocab(vocab.end_token())])
# captions_combined.append(caption)
# #yield image, caption
#return image,torch.Tensor(captions_combined)
caption_str = str(caption_all_for_a_image).lower()
tokens = nltk.tokenize.word_tokenize(caption_str)
caption = torch.Tensor([vocab(vocab.start_token())] +
[vocab(token) for token in tokens] +
[vocab(vocab.end_token())])
return image,caption
def __len__(self):
return len(self.all_imgs_path)
+++++++++++++++++++++++++++++++++
python pytorch
which of the lines do you want? the first? last? a random one?
– Shai
Nov 23 '18 at 9:33
Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
– rajeshkumargp
Nov 24 '18 at 5:00
add a comment |
I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me
Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.
Here they have used Standard COCO Dataset.
I have dataset as images/ and captions/ directory .
Example
Directory Structure:
images/T001.jpg
images/T002.jpg
...
...
captions/T001.txt
captions/T002.txt
....
....
The above is the relation. Caption file has 'n' number of captions in each separate line.
I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.
Any guidance/suggestion on how to achieving this.
++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:
from __future__ import print_function
import torch
from torchvision import datasets, models, transforms
from torchvision import transforms
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim
import torch.nn as nn
#from torch import np
import numpy as np
import utils_c
from data_loader_c import get_cust_data_loader
from models import CNN, RNN
from vocab_custom import Vocabulary, load_vocab
import os
class ImageCaptionDataSet(data.Dataset):
def __init__(self, path, json, vocab=None, transform=None):
self.vocab = vocab
self.transform = transform
self.img_dir_path = path
self.cap_dir_path = json
self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))
self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))
pass
def __getitem__(self,index):
vocab = self.vocab
img_path = self.all_imgs_path[index]
img_base_name = os.path.basename(img_path)
cap_base_name = img_base_name.replace(".jpg",".txt")
cap_path = os.path.join(self.cap_dir_path,cap_base_name)
caption_all_for_a_image = open(cap_path).read().split("n")
image = Image.open(img_path)
image = image.convert('RGB')
if self.transform != None:
# apply image preprocessing
image = self.transform(image)
#captions_combined =
#max_len = 0
#for caption in caption_all_for_a_image:
# caption_str = str(caption).lower()
# tokens = nltk.tokenize.word_tokenize(caption_str)
# m = len(tokens) + 2
# if m>max_len:
# max_len = m
# caption = torch.Tensor([vocab(vocab.start_token())] +
# [vocab(token) for token in tokens] +
# [vocab(vocab.end_token())])
# captions_combined.append(caption)
# #yield image, caption
#return image,torch.Tensor(captions_combined)
caption_str = str(caption_all_for_a_image).lower()
tokens = nltk.tokenize.word_tokenize(caption_str)
caption = torch.Tensor([vocab(vocab.start_token())] +
[vocab(token) for token in tokens] +
[vocab(vocab.end_token())])
return image,caption
def __len__(self):
return len(self.all_imgs_path)
+++++++++++++++++++++++++++++++++
python pytorch
I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me
Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.
Here they have used Standard COCO Dataset.
I have dataset as images/ and captions/ directory .
Example
Directory Structure:
images/T001.jpg
images/T002.jpg
...
...
captions/T001.txt
captions/T002.txt
....
....
The above is the relation. Caption file has 'n' number of captions in each separate line.
I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.
Any guidance/suggestion on how to achieving this.
++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:
from __future__ import print_function
import torch
from torchvision import datasets, models, transforms
from torchvision import transforms
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim
import torch.nn as nn
#from torch import np
import numpy as np
import utils_c
from data_loader_c import get_cust_data_loader
from models import CNN, RNN
from vocab_custom import Vocabulary, load_vocab
import os
class ImageCaptionDataSet(data.Dataset):
def __init__(self, path, json, vocab=None, transform=None):
self.vocab = vocab
self.transform = transform
self.img_dir_path = path
self.cap_dir_path = json
self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))
self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))
pass
def __getitem__(self,index):
vocab = self.vocab
img_path = self.all_imgs_path[index]
img_base_name = os.path.basename(img_path)
cap_base_name = img_base_name.replace(".jpg",".txt")
cap_path = os.path.join(self.cap_dir_path,cap_base_name)
caption_all_for_a_image = open(cap_path).read().split("n")
image = Image.open(img_path)
image = image.convert('RGB')
if self.transform != None:
# apply image preprocessing
image = self.transform(image)
#captions_combined =
#max_len = 0
#for caption in caption_all_for_a_image:
# caption_str = str(caption).lower()
# tokens = nltk.tokenize.word_tokenize(caption_str)
# m = len(tokens) + 2
# if m>max_len:
# max_len = m
# caption = torch.Tensor([vocab(vocab.start_token())] +
# [vocab(token) for token in tokens] +
# [vocab(vocab.end_token())])
# captions_combined.append(caption)
# #yield image, caption
#return image,torch.Tensor(captions_combined)
caption_str = str(caption_all_for_a_image).lower()
tokens = nltk.tokenize.word_tokenize(caption_str)
caption = torch.Tensor([vocab(vocab.start_token())] +
[vocab(token) for token in tokens] +
[vocab(vocab.end_token())])
return image,caption
def __len__(self):
return len(self.all_imgs_path)
+++++++++++++++++++++++++++++++++
python pytorch
python pytorch
asked Nov 23 '18 at 7:43
rajeshkumargprajeshkumargp
2716
2716
which of the lines do you want? the first? last? a random one?
– Shai
Nov 23 '18 at 9:33
Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
– rajeshkumargp
Nov 24 '18 at 5:00
add a comment |
which of the lines do you want? the first? last? a random one?
– Shai
Nov 23 '18 at 9:33
Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
– rajeshkumargp
Nov 24 '18 at 5:00
which of the lines do you want? the first? last? a random one?
– Shai
Nov 23 '18 at 9:33
which of the lines do you want? the first? last? a random one?
– Shai
Nov 23 '18 at 9:33
Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
– rajeshkumargp
Nov 24 '18 at 5:00
Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
– rajeshkumargp
Nov 24 '18 at 5:00
add a comment |
1 Answer
1
active
oldest
votes
First, using str()
to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)
) is a bad idea:
cap = ['a sentence', 'bla bla bla']
str(cap)
Returns this sting:
"['a sentence', 'bla bla bla']"
Note that ['
, and ', '
are part of the resulting string!
You can pick one of the captions at random:
import random
...
cap_idx = random.randi(0, len(caption_all_for_a_image)-1) # pick one at random
caption_str = caption_all_for_a_image[cap_idx].lower() # actual selection
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53442510%2fhow-to-handle-multi-label-dataset-from-directory-for-image-captioning-in-pytorch%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
First, using str()
to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)
) is a bad idea:
cap = ['a sentence', 'bla bla bla']
str(cap)
Returns this sting:
"['a sentence', 'bla bla bla']"
Note that ['
, and ', '
are part of the resulting string!
You can pick one of the captions at random:
import random
...
cap_idx = random.randi(0, len(caption_all_for_a_image)-1) # pick one at random
caption_str = caption_all_for_a_image[cap_idx].lower() # actual selection
add a comment |
First, using str()
to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)
) is a bad idea:
cap = ['a sentence', 'bla bla bla']
str(cap)
Returns this sting:
"['a sentence', 'bla bla bla']"
Note that ['
, and ', '
are part of the resulting string!
You can pick one of the captions at random:
import random
...
cap_idx = random.randi(0, len(caption_all_for_a_image)-1) # pick one at random
caption_str = caption_all_for_a_image[cap_idx].lower() # actual selection
add a comment |
First, using str()
to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)
) is a bad idea:
cap = ['a sentence', 'bla bla bla']
str(cap)
Returns this sting:
"['a sentence', 'bla bla bla']"
Note that ['
, and ', '
are part of the resulting string!
You can pick one of the captions at random:
import random
...
cap_idx = random.randi(0, len(caption_all_for_a_image)-1) # pick one at random
caption_str = caption_all_for_a_image[cap_idx].lower() # actual selection
First, using str()
to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)
) is a bad idea:
cap = ['a sentence', 'bla bla bla']
str(cap)
Returns this sting:
"['a sentence', 'bla bla bla']"
Note that ['
, and ', '
are part of the resulting string!
You can pick one of the captions at random:
import random
...
cap_idx = random.randi(0, len(caption_all_for_a_image)-1) # pick one at random
caption_str = caption_all_for_a_image[cap_idx].lower() # actual selection
answered Nov 23 '18 at 9:51
ShaiShai
71.1k23139250
71.1k23139250
add a comment |
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53442510%2fhow-to-handle-multi-label-dataset-from-directory-for-image-captioning-in-pytorch%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
which of the lines do you want? the first? last? a random one?
– Shai
Nov 23 '18 at 9:33
Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.
– rajeshkumargp
Nov 24 '18 at 5:00