You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
firs tI tried it in terminal using this code
and generate two encoder and decoder files
import torch
from PIL import Image
from torchvision import transforms
from transformers import VisionEncoderDecoderModel, BertJapaneseTokenizer
from collections import namedtuple
# Define a named tuple for encoder output
EncoderOutput = namedtuple('EncoderOutput', ['last_hidden_state'])
# Define a wrapper for the encoder
class EncoderWrapper(torch.nn.Module):
def __init__(self, encoder):
super().__init__()
self.encoder = encoder
def forward(self, pixel_values):
encoder_outputs = self.encoder(pixel_values)
return EncoderOutput(last_hidden_state=encoder_outputs.last_hidden_state)
# Define a wrapper for the decoder
class DecoderWrapper(torch.nn.Module):
def __init__(self, decoder):
super().__init__()
self.decoder = decoder
def forward(self, input_ids, encoder_hidden_states, attention_mask):
decoder_outputs = self.decoder(input_ids=input_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=attention_mask)
return decoder_outputs.logits
# Load the model from the local directory
model_path = "/Users/Basil/Downloads/manga-ocr-base"
model = VisionEncoderDecoderModel.from_pretrained(model_path)
tokenizer = BertJapaneseTokenizer.from_pretrained(model_path)
# Define the transformation
transform = transforms.Compose([
transforms.Resize((224, 224)), # Resize the image to match the model input size
transforms.ToTensor(), # Convert the image to a tensor
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Normalize the image
])
# Load and preprocess the image
image_path = "/Users/Basil/Downloads/IMG_0604.jpg"
image = Image.open(image_path).convert("RGB")
input_tensor = transform(image).unsqueeze(0) # Add batch dimension
# Set the model to evaluation mode
model.eval()
# Wrap the encoder and decoder
encoder_wrapper = EncoderWrapper(model.encoder)
decoder_wrapper = DecoderWrapper(model.decoder)
# Run encoder
with torch.no_grad():
encoder_outputs = encoder_wrapper(input_tensor)
# Initialize decoder inputs
decoder_input_ids = torch.tensor([[tokenizer.cls_token_id]])
generated_ids = []
# Generate tokens step by step
for _ in range(128): # Generate up to 128 tokens
with torch.no_grad():
decoder_logits = decoder_wrapper(
input_ids=decoder_input_ids,
encoder_hidden_states=encoder_outputs.last_hidden_state,
attention_mask=torch.ones(encoder_outputs.last_hidden_state.size()[:-1], dtype=torch.long)
)
# Get the most probable token
next_token_id = torch.argmax(decoder_logits[:, -1, :], dim=-1).unsqueeze(-1)
# Append generated token
generated_ids.append(next_token_id.item())
# Prepare the input for the next step
decoder_input_ids = torch.cat([decoder_input_ids, next_token_id], dim=1)
# Stop if end of sequence token is generated
if next_token_id.item() == tokenizer.sep_token_id:
break
# Decode the token indices to a string
output_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
print("Output text:", output_text)
# Save the scripted encoder and decoder for CoreML conversion
traced_encoder = torch.jit.trace(encoder_wrapper, (input_tensor,))
traced_decoder = torch.jit.trace(decoder_wrapper, (decoder_input_ids, encoder_outputs.last_hidden_state, torch.ones(encoder_outputs.last_hidden_state.size()[:-1], dtype=torch.long)))
traced_encoder.save("/Users/Basil/Documents/scripted_encoder.pt")
traced_decoder.save("/Users/Basil/Documents/scripted_decoder.pt")
then convert it to coreML
import coremltools as ct
import torch
import numpy as np
# Load the traced models
scripted_encoder = torch.jit.load("/Users/Basil/Documents/scripted_encoder.pt")
scripted_decoder = torch.jit.load("/Users/Basil/Documents/scripted_decoder.pt")
# Ensure models are in eval mode
scripted_encoder.eval()
scripted_decoder.eval()
# Convert the encoder model to CoreML
encoder_input = torch.rand(1, 3, 224, 224)
encoder_mlmodel = ct.convert(
scripted_encoder,
inputs=[ct.ImageType(name="input", shape=encoder_input.shape)], # Specify image input type
convert_to="mlprogram"
)
encoder_mlmodel.save("/Users/Basil/Documents/EncoderModel.mlpackage")
# Convert the decoder model to CoreML
decoder_input = torch.randint(0, 30522, (1, 128)) # Example input tensor, adjust as necessary
encoder_output = torch.rand(1, 197, 768) # Example encoder output tensor, adjust as necessary
attention_mask = torch.ones(1, 197) # Example attention mask, adjust as necessary
decoder_mlmodel = ct.convert(
scripted_decoder,
inputs=[
ct.TensorType(name="input_ids", shape=decoder_input.shape, dtype=np.int32),
ct.TensorType(name="encoder_hidden_states", shape=encoder_output.shape, dtype=np.float32),
ct.TensorType(name="attention_mask", shape=attention_mask.shape, dtype=np.float32)
],
convert_to="mlprogram"
)
decoder_mlmodel.save("/Users/Basil/Documents/DecoderModel.mlpackage")
the converted done correctly
I tried
inputs=[ct.TensorType(shape=encoder_input.shape)]
but it give me issue
The model does not have a valid input feature of type image
so I change it to inputs=[ct.ImageType(name="input", shape=encoder_input.shape)], # Specify image input type
when I tried the code
the decoder model give me
Element 0: nan
Element 1: nan
Element 2: nan
Element 3: nan
Element 4: nan
Element 5: nan
Element 6: nan
Element 7: nan
Element 8: nan
Element 9: nan
did I do something wrong, or does this model cannot converted to CoreML ?
I'm tried to convert model to coreML
firs tI tried it in terminal using this code
and generate two encoder and decoder files
then convert it to coreML
the converted done correctly
I tried
inputs=[ct.TensorType(shape=encoder_input.shape)]
but it give me issue
so I change it to
inputs=[ct.ImageType(name="input", shape=encoder_input.shape)], # Specify image input type
when I tried the code
the decoder model give me
Element 0: nan
Element 1: nan
Element 2: nan
Element 3: nan
Element 4: nan
Element 5: nan
Element 6: nan
Element 7: nan
Element 8: nan
Element 9: nan
did I do something wrong, or does this model cannot converted to CoreML ?
this is the model that I tried to converted
The text was updated successfully, but these errors were encountered: