Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions colpali_engine/collators/visual_retriever_collator.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,11 @@ def __init__(

# If processor is one of the supported types, extract the <image> token id.
if isinstance(self.processor, (ColPaliProcessor,)):
image_token = "<image>"
try:
idx = self.processor.tokenizer.additional_special_tokens.index(image_token)
self.image_token_id = self.processor.tokenizer.additional_special_tokens_ids[idx]
except ValueError:
self.image_token_id = None
if hasattr(self.processor, "image_token_id"):
self.image_token_id = self.processor.image_token_id
else:
image_token = "<image>"
self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(image_token)

# Force padding to be on the right for ColPaliProcessor.
if isinstance(self.processor, ColPaliProcessor) and self.processor.tokenizer.padding_side != "right":
Expand Down
8 changes: 2 additions & 6 deletions colpali_engine/models/gemma3/bigemma3/processing_bigemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,8 @@ class BiGemmaProcessor3(BaseVisualRetrieverProcessor, Gemma3Processor): # noqa:

query_augmentation_token: ClassVar[str] = "<eos>"

def __init__(
self,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
super().__init__(image_processor=image_processor, tokenizer=tokenizer, **kwargs)
self.tokenizer.padding_side = "left"

@classmethod
Expand Down
9 changes: 2 additions & 7 deletions colpali_engine/models/gemma3/colgemma3/processing_colgemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,8 @@ class ColGemmaProcessor3(BaseVisualRetrieverProcessor, Gemma3Processor):

query_augmentation_token: ClassVar[str] = "<eos>"

def __init__(
self,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
# Set padding side to left (important for decoder-only models)
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
super().__init__(image_processor=image_processor, tokenizer=tokenizer, **kwargs)
self.tokenizer.padding_side = "left"

@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def __init__(self, config, mask_non_image_embeddings: bool = False):
self.linear = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
self.mask_non_image_embeddings = mask_non_image_embeddings
self.main_input_name = "doc_input_ids"
self.post_init()

def forward(self, *args, **kwargs):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ class ColIdefics3Processor(
image_token: ClassVar[str] = "<image>"
visual_prompt_prefix: ClassVar[str] = "<|im_start|>User:<image>Describe the image.<end_of_utterance>\nAssistant:"

def __init__(self, *args, image_seq_len=64, **kwargs):
super().__init__(*args, image_seq_len=image_seq_len, **kwargs)
def __init__(self, image_processor=None, tokenizer=None, image_seq_len=64, **kwargs):
super().__init__(image_processor=image_processor, tokenizer=tokenizer, image_seq_len=image_seq_len, **kwargs)
self.tokenizer.padding_side = "left"

def process_images(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def __init__(self, config, mask_non_image_embeddings: bool = False, **kwargs):
self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
self.mask_non_image_embeddings = mask_non_image_embeddings
self.main_input_name = "doc_input_ids"
self.post_init()

def forward(self, *args, **kwargs):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ class ColModernVBertProcessor(
"<|begin_of_text|>User:<image>Describe the image.<end_of_utterance>\nAssistant:"
)

def __init__(self, *args, image_seq_len=64, **kwargs):
super().__init__(*args, image_seq_len=image_seq_len, **kwargs)
def __init__(self, image_processor=None, tokenizer=None, image_seq_len=64, **kwargs):
super().__init__(image_processor=image_processor, tokenizer=tokenizer, image_seq_len=image_seq_len, **kwargs)
self.tokenizer.padding_side = "left"

def process_images(
Expand Down
23 changes: 11 additions & 12 deletions colpali_engine/models/paligemma/colpali/modeling_colpali.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,7 @@
def __init__(self, config: PaliGemmaConfig, mask_non_image_embeddings: bool = False):
super().__init__(config=config)

model = PaliGemmaForConditionalGeneration(config=config)
if model.language_model._tied_weights_keys is not None:
self._tied_weights_keys = [f"model.language_model.{k}" for k in model.language_model._tied_weights_keys]
self.model = model
self.model.lm_head = torch.nn.Identity()
self.model = PaliGemmaForConditionalGeneration(config=config)

# TODO: Wait for ColPali2 to create a ColPaliConfig to allow specifying the embedding dimension.
# We could do it now but it would break all the models trying to load the model from the checkpoint.
Expand Down Expand Up @@ -75,25 +71,25 @@
return proj

def get_input_embeddings(self):
return self.model.language_model.get_input_embeddings()
return self.model.model.language_model.get_input_embeddings()

def set_input_embeddings(self, value):
self.model.language_model.set_input_embeddings(value)
self.model.model.language_model.set_input_embeddings(value)

def get_output_embeddings(self):
return self.model.language_model.get_output_embeddings()
return self.model.model.language_model.get_output_embeddings()

def set_output_embeddings(self, new_embeddings):
self.model.language_model.set_output_embeddings(new_embeddings)
self.model.model.language_model.set_output_embeddings(new_embeddings)

def set_decoder(self, decoder):
self.model.language_model.set_decoder(decoder)
self.model.model.language_model.set_decoder(decoder)

def get_decoder(self):
return self.model.language_model.get_decoder()
return self.model.model.language_model.get_decoder()

def tie_weights(self):
return self.model.language_model.tie_weights()
return self.model.model.language_model.tie_weights()

def resize_token_embeddings(
self,
Expand All @@ -112,3 +108,6 @@
@property
def patch_size(self) -> int:
return self.model.vision_tower.config.patch_size

def tie_weights(self, missing_keys=None, recompute_mapping=False):

Check failure on line 112 in colpali_engine/models/paligemma/colpali/modeling_colpali.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (F811)

colpali_engine/models/paligemma/colpali/modeling_colpali.py:112:9: F811 Redefinition of unused `tie_weights` from line 91: `tie_weights` redefined here
pass
3 changes: 0 additions & 3 deletions colpali_engine/models/paligemma/colpali/processing_colpali.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@ class ColPaliProcessor(BaseVisualRetrieverProcessor, PaliGemmaProcessor):

visual_prompt_prefix: ClassVar[str] = "<image><bos>Describe the image."

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

@property
def query_augmentation_token(self) -> str:
"""
Expand Down
2 changes: 1 addition & 1 deletion colpali_engine/models/qwen2/colqwen2/modeling_colqwen2.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class ColQwen2(Qwen2VLModel):
def __init__(self, config: Qwen2VLConfig, mask_non_image_embeddings: bool = False):
super().__init__(config=config)
self.dim = 128
self.custom_text_proj = nn.Linear(self.config.hidden_size, self.dim)
self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.dim)
self.padding_side = "left"
self.mask_non_image_embeddings = mask_non_image_embeddings
self.post_init()
Expand Down
8 changes: 2 additions & 6 deletions colpali_engine/models/qwen2/colqwen2/processing_colqwen2.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,8 @@ class ColQwen2Processor(BaseVisualRetrieverProcessor, Qwen2VLProcessor):
query_augmentation_token: ClassVar[str] = "<|endoftext|>"
image_token: ClassVar[str] = "<|image_pad|>"

def __init__(
self,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
def __init__(self, image_processor=None, tokenizer=None, image_seq_len=64, **kwargs):
super().__init__(image_processor=image_processor, tokenizer=tokenizer, image_seq_len=image_seq_len, **kwargs)
self.tokenizer.padding_side = "left"

@classmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class ColQwen2_5(Qwen2_5_VLModel): # noqa: N801
def __init__(self, config: Qwen2_5_VLConfig, mask_non_image_embeddings: bool = False):
super().__init__(config=config)
self.dim = 128
self.custom_text_proj = nn.Linear(self.config.hidden_size, self.dim)
self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.dim)
self.padding_side = "left"
self.mask_non_image_embeddings = mask_non_image_embeddings
self.post_init()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,8 @@ class ColQwen2_5_Processor(BaseVisualRetrieverProcessor, Qwen2VLProcessor): # n
query_augmentation_token: ClassVar[str] = "<|endoftext|>"
image_token: ClassVar[str] = "<|image_pad|>"

def __init__(
self,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
def __init__(self, image_processor=None, tokenizer=None, image_seq_len=64, **kwargs):
super().__init__(image_processor=image_processor, tokenizer=tokenizer, image_seq_len=image_seq_len, **kwargs)
self.tokenizer.padding_side = "left"

@classmethod
Expand Down
8 changes: 2 additions & 6 deletions colpali_engine/models/qwen3/colqwen3/processing_colqwen3.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,8 @@ class ColQwen3Processor(BaseVisualRetrieverProcessor, Qwen3VLProcessor):
query_augmentation_token: ClassVar[str] = "<|endoftext|>"
image_token: ClassVar[str] = "<|image_pad|>"

def __init__(
self,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
def __init__(self, image_processor=None, tokenizer=None, image_seq_len=64, **kwargs):
super().__init__(image_processor=image_processor, tokenizer=tokenizer, image_seq_len=image_seq_len, **kwargs)
self.tokenizer.padding_side = "left"

@classmethod
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ maintainers = [
{ name = "Tony Wu", email = "tony.wu@illuin.tech" },
]
readme = "README.md"
requires-python = ">=3.9"
requires-python = ">=3.10"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand All @@ -34,13 +34,13 @@ classifiers = [

dependencies = [
"numpy",
"peft>=0.14.0,<0.18.0",
"peft>=0.18.0,<0.19.0",
"pillow>=10.0.0",
"requests",
"scipy",
"torch>=2.2.0,<2.10.0",
"torchvision",
"transformers>=4.57.0,<4.58.0",
"transformers>=5.0.0,<5.1.0",
]

[project.optional-dependencies]
Expand Down
Loading