Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ StarVector is a multimodal vision-language model for Scalable Vector Graphics (S

### Multimodal Architecture

StarVector uses a multimodal architecture to process images and text. When performing Image-to-SVG (or image vectorization), the image is projected into visual tokens, and SVG code is generated. When performing Text-to-SVG, the model only recieves the text instruction (no image is provided), and a novel SVG is created. The LLM is based of StarCoder, which we leverage to transfer coding skills to SVG generation.
StarVector uses a multimodal architecture to process images and text. When performing Image-to-SVG (or image vectorization), the image is projected into visual tokens, and SVG code is generated. When performing Text-to-SVG, the model only receives the text instruction (no image is provided), and a novel SVG is created. The LLM is based on StarCoder, which we leverage to transfer coding skills to SVG generation.

<div align="center">
<img src="assets/starvector-arch.png" alt="starvector" style="width: 700px; display: block; margin-left: auto; margin-right: auto;" />
Expand Down
4 changes: 2 additions & 2 deletions starvector/metrics/inception.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class InceptionV3(nn.Module):
# Maps feature dimensionality to their output blocks indices
BLOCK_INDEX_BY_DIM = {
64: 0, # First max pooling features
192: 1, # Second max pooling featurs
192: 1, # Second max pooling features
768: 2, # Pre-aux classifier features
2048: 3 # Final average pooling features
}
Expand Down Expand Up @@ -171,7 +171,7 @@ def _inception_v3(*args, **kwargs):
# Just a caution against weird version strings
version = (0,)

# Skips default weight inititialization if supported by torchvision
# Skips default weight initialization if supported by torchvision
# version. See https://github.com/mseitzer/pytorch-fid/issues/28.
if version >= (0, 6):
kwargs['init_weights'] = False
Expand Down
2 changes: 1 addition & 1 deletion starvector/model/gpt_bigcode/modeling_gpt_bigcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

Expand Down
2 changes: 1 addition & 1 deletion starvector/model/llm/starcoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self, config, **kwargs):

def init_tokenizer(self, model_name):
self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# Incude padding and eos tokens in the vocabulary
# Include padding and eos tokens in the vocabulary
if self.tokenizer.eos_token_id is None:
self.tokenizer.add_special_tokens({"eos_token": "[EOS]"})
if self.tokenizer.pad_token_id is None:
Expand Down
2 changes: 1 addition & 1 deletion starvector/model/llm/starcoder2.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(self, config, **kwargs):

def init_tokenizer(self, model_name):
self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# Incude padding and eos tokens in the vocabulary
# Include padding and eos tokens in the vocabulary
if self.tokenizer.eos_token_id is None:
self.tokenizer.add_special_tokens({"eos_token": "[EOS]"})
if self.tokenizer.pad_token_id is None:
Expand Down
2 changes: 1 addition & 1 deletion starvector/train/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def setup_train_env_variables(config):

def load_fsdp_plugin(config, model):
if config.fsdp.enable:
# get mixed precsion dtype
# get mixed precision dtype
mixed_precision_dtype = {
"fp16": torch.float16,
"bf16": torch.bfloat16,
Expand Down