@@ -56,7 +56,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
5656 std::shared_ptr<CLIPTextModelRunner> text_model2;
5757
5858 std::string trigger_word = " img" ; // should be user settable
59- std::string embd_dir ;
59+ std::map<std:: string, std::string> embedding_map ;
6060 int32_t num_custom_embeddings = 0 ;
6161 int32_t num_custom_embeddings_2 = 0 ;
6262 std::vector<uint8_t > token_embed_custom;
@@ -65,11 +65,17 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
6565 FrozenCLIPEmbedderWithCustomWords (ggml_backend_t backend,
6666 bool offload_params_to_cpu,
6767 const String2TensorStorage& tensor_storage_map,
68- const std::string& embd_dir ,
68+ const std::map<std:: string, std::string>& orig_embedding_map ,
6969 SDVersion version = VERSION_SD1,
7070 PMVersion pv = PM_VERSION_1)
71- : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407 ), embd_dir(embd_dir) {
72- bool force_clip_f32 = embd_dir.size () > 0 ;
71+ : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407 ) {
72+ for (const auto & kv : orig_embedding_map) {
73+ std::string name = kv.first ;
74+ std::transform (name.begin (), name.end (), name.begin (), [](unsigned char c) { return std::tolower (c); });
75+ embedding_map[name] = kv.second ;
76+ tokenizer.add_special_token (name);
77+ }
78+ bool force_clip_f32 = !embedding_map.empty ();
7379 if (sd_version_is_sd1 (version)) {
7480 text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, true , force_clip_f32);
7581 } else if (sd_version_is_sd2 (version)) {
@@ -196,25 +202,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
196202
197203 std::vector<int > convert_token_to_id (std::string text) {
198204 auto on_new_token_cb = [&](std::string& str, std::vector<int32_t >& bpe_tokens) -> bool {
199- size_t word_end = str.find (" ," );
200- std::string embd_name = word_end == std::string::npos ? str : str.substr (0 , word_end);
201- embd_name = trim (embd_name);
202- std::string embd_path = get_full_path (embd_dir, embd_name + " .pt" );
203- if (embd_path.size () == 0 ) {
204- embd_path = get_full_path (embd_dir, embd_name + " .ckpt" );
205+ auto iter = embedding_map.find (str);
206+ if (iter == embedding_map.end ()) {
207+ return false ;
205208 }
206- if (embd_path.size () == 0 ) {
207- embd_path = get_full_path (embd_dir, embd_name + " .safetensors" );
208- }
209- if (embd_path.size () > 0 ) {
210- if (load_embedding (embd_name, embd_path, bpe_tokens)) {
211- if (word_end != std::string::npos) {
212- str = str.substr (word_end);
213- } else {
214- str = " " ;
215- }
216- return true ;
217- }
209+ std::string embedding_path = iter->second ;
210+ if (load_embedding (str, embedding_path, bpe_tokens)) {
211+ return true ;
218212 }
219213 return false ;
220214 };
@@ -245,25 +239,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
245239 }
246240
247241 auto on_new_token_cb = [&](std::string& str, std::vector<int32_t >& bpe_tokens) -> bool {
248- size_t word_end = str.find (" ," );
249- std::string embd_name = word_end == std::string::npos ? str : str.substr (0 , word_end);
250- embd_name = trim (embd_name);
251- std::string embd_path = get_full_path (embd_dir, embd_name + " .pt" );
252- if (embd_path.size () == 0 ) {
253- embd_path = get_full_path (embd_dir, embd_name + " .ckpt" );
254- }
255- if (embd_path.size () == 0 ) {
256- embd_path = get_full_path (embd_dir, embd_name + " .safetensors" );
242+ auto iter = embedding_map.find (str);
243+ if (iter == embedding_map.end ()) {
244+ return false ;
257245 }
258- if (embd_path.size () > 0 ) {
259- if (load_embedding (embd_name, embd_path, bpe_tokens)) {
260- if (word_end != std::string::npos) {
261- str = str.substr (word_end);
262- } else {
263- str = " " ;
264- }
265- return true ;
266- }
246+ std::string embedding_path = iter->second ;
247+ if (load_embedding (str, embedding_path, bpe_tokens)) {
248+ return true ;
267249 }
268250 return false ;
269251 };
@@ -376,25 +358,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
376358 }
377359
378360 auto on_new_token_cb = [&](std::string& str, std::vector<int32_t >& bpe_tokens) -> bool {
379- size_t word_end = str.find (" ," );
380- std::string embd_name = word_end == std::string::npos ? str : str.substr (0 , word_end);
381- embd_name = trim (embd_name);
382- std::string embd_path = get_full_path (embd_dir, embd_name + " .pt" );
383- if (embd_path.size () == 0 ) {
384- embd_path = get_full_path (embd_dir, embd_name + " .ckpt" );
385- }
386- if (embd_path.size () == 0 ) {
387- embd_path = get_full_path (embd_dir, embd_name + " .safetensors" );
361+ auto iter = embedding_map.find (str);
362+ if (iter == embedding_map.end ()) {
363+ return false ;
388364 }
389- if (embd_path.size () > 0 ) {
390- if (load_embedding (embd_name, embd_path, bpe_tokens)) {
391- if (word_end != std::string::npos) {
392- str = str.substr (word_end);
393- } else {
394- str = " " ;
395- }
396- return true ;
397- }
365+ std::string embedding_path = iter->second ;
366+ if (load_embedding (str, embedding_path, bpe_tokens)) {
367+ return true ;
398368 }
399369 return false ;
400370 };
@@ -1728,7 +1698,7 @@ struct LLMEmbedder : public Conditioner {
17281698 std::vector<std::pair<int , ggml_tensor*>> image_embeds;
17291699 std::pair<int , int > prompt_attn_range;
17301700 int prompt_template_encode_start_idx = 34 ;
1731- int max_length = 0 ;
1701+ int max_length = 0 ;
17321702 std::set<int > out_layers;
17331703 if (llm->enable_vision && conditioner_params.ref_images .size () > 0 ) {
17341704 LOG_INFO (" QwenImageEditPlusPipeline" );
@@ -1828,7 +1798,7 @@ struct LLMEmbedder : public Conditioner {
18281798 prompt += " [/INST]" ;
18291799 } else if (version == VERSION_OVIS_IMAGE) {
18301800 prompt_template_encode_start_idx = 28 ;
1831- max_length = prompt_template_encode_start_idx + 256 ;
1801+ max_length = prompt_template_encode_start_idx + 256 ;
18321802
18331803 prompt = " <|im_start|>user\n Describe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:" ;
18341804
0 commit comments