Commit

Inference README update (#4)
* chore: Update README.md

* Minor Update README.md

* Update README.md for error fix.

* chore: Update README.md

* Update README.md for demo site deployment and formatting improvements

* chore: Update README.md for formatting improvements

* Update README.md for consistent formatting and language improvements

* Update README.md for consistent formatting and demo site URL

* Update flow_mirror-s model inference code.

* Update README.md for release inference code.

* README update

* requirements update

* Several changes for model adaptation

---------

Co-authored-by: peter65374 <futuretrader@gmail.com>
Co-authored-by: happen <happenmass@gmail.com>
Co-authored-by: jzx-ai-lab <165371609+jzx-ai-lab@users.noreply.github.com>
4 people authored Sep 3, 2024
1 parent 344c185 commit be7dea3
Showing 5 changed files with 124 additions and 15 deletions.
107 changes: 104 additions & 3 deletions flow_mirror_s/README.md
# Flow_mirror_s

This README covers inference with the Flow Mirror-s model.

## Requirements
```
conda create -n flowmirror python=3.10
conda activate flowmirror
# downgrade pip to 23.1.1 for the requirements of fairseq
pip install pip==23.1.1
pip3 install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt
```
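
After installing, a quick sanity check (optional) confirms that the CUDA 11.8 build of PyTorch is the one in the environment:

```python
# Sanity check: the cu118 wheel of torch should see the GPU.
import torch

print(torch.__version__)           # expect something like "2.x.x+cu118"
print(torch.cuda.is_available())   # should print True on a CUDA 11.8 machine
```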

## Model download
### modelscope
```python
from modelscope import snapshot_download
snapshot_download('jzx-ai-lab/Flow_mirror', local_dir='jzx-ai-lab/Flow_mirror')
```
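### huggingface
If the checkpoint is also mirrored on the Hugging Face Hub (the repo id below is an assumption, mirroring the ModelScope one), `huggingface_hub` can fetch it the same way:

```python
# Sketch: assumes a Hugging Face mirror with the same repo id exists.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="jzx-ai-lab/Flow_mirror", local_dir="jzx-ai-lab/Flow_mirror")
```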
## Load flow_mirror model
### Load model

```python
from flow_mirror_model import FlowmirrorForConditionalGeneration
from hubert_kmeans import HubertCodeExtractor
from transformers import AutoTokenizer

ckpt_path = "jzx-ai-lab/Flow_mirror" # download from modelscope or huggingface
model = FlowmirrorForConditionalGeneration.from_pretrained(ckpt_path)
code_extractor = HubertCodeExtractor(
ckpt_path=f"{ckpt_path}/chinese-hubert-ckpt-20240628.pt",
km_path="hubert_kmeans/kmeans_500.pkl",
layer=24,
rank=0
)
tokenizer = AutoTokenizer.from_pretrained(f"{ckpt_path}/tokenizer")

model.eval().to(torch.float16).to("cuda")
```
### Load speaker_embedding from a .pt file
```python
speaker_embeddings = torch.load("hubert_kmeans/speaker_embedding.pt")
```
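
The layout of this checkpoint is not documented here, so a quick inspection (a sketch) can confirm whether it holds a single embedding tensor or a mapping from speaker names to tensors:

```python
# Sketch: inspect the loaded object before passing an embedding to generate().
if isinstance(speaker_embeddings, dict):
    print({name: tuple(t.shape) for name, t in speaker_embeddings.items()})
else:
    print(type(speaker_embeddings), tuple(speaker_embeddings.shape))
```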
### Extract speaker_embedding from a reference audio (the audio must be sampled at 16 kHz)
```python
from transformers import AutoFeatureExtractor
import soundfile as sf

speaker_encoder = model.speaker_encoder
feature_extractor = AutoFeatureExtractor.from_pretrained("hubert_kmeans")

ref_wav = f"{ckpt_path}/assets/question_example_1_MP3.mp3"
reference_audio_input = feature_extractor(sf.read(ref_wav)[0], sampling_rate=16000, return_tensors="pt").to("cuda")
speaker_embedding = speaker_encoder.encode(reference_audio_input["input_values"])
```
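
If the reference clip is not already at 16 kHz, resample it first. A minimal sketch using `librosa` (already pinned in `requirements.txt`):

```python
# Sketch: load at the native rate, then resample to the 16 kHz the encoder expects.
import librosa

wav, sr = librosa.load(ref_wav, sr=None)  # sr=None keeps the native sampling rate
if sr != 16000:
    wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
reference_audio_input = feature_extractor(wav, sampling_rate=16000, return_tensors="pt").to("cuda")
```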

## Inference Code
```python
def deduplicates(cluster_ids):
    # collapse each run of identical cluster ids into a single id
    dup_cluster_list = []
    for i in range(len(cluster_ids)):
        if i + 1 < len(cluster_ids) and cluster_ids[i] == cluster_ids[i + 1]:
            continue
        dup_cluster_list.append(cluster_ids[i])
    return dup_cluster_list

def convert_label_to_text(label):
    # map each cluster id to its audio token, e.g. 42 -> "<|audio_42|>"
    return "".join(f"<|audio_{i}|>" for i in label)
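
# quick worked example (illustrative):
#   deduplicates([3, 3, 7, 7, 5])     -> [3, 7, 5]
#   convert_label_to_text([3, 7, 5])  -> "<|audio_3|><|audio_7|><|audio_5|>"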

# extract code tokens from the hubert features
feats = code_extractor.get_feats(f"{ckpt_path}/assets/question_example_1_MP3.mp3")
codes = code_extractor.dump_label(feats)

codes = deduplicates(codes)
label_text = convert_label_to_text(codes)

# apply the model's generation template
prompt = f"<|spk_embed|><|startofaudio|>{label_text}<|endofaudio|><|startofcont|>"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# define generation config
gen_kwargs = {
"do_sample": True,
"temperature": 0.9,
"max_new_tokens": 512,
"use_cache": True,
"min_new_tokens": 9 + 1,
}

generation, text_completion = model.generate(
    prompt_input_ids=input_ids.to("cuda"),
    speaker_embedding=speaker_embedding.to(model.dtype).to(model.device),
    **gen_kwargs,
)

# convert the generated waveform to a numpy array
audio_arr = generation.float().cpu().numpy().squeeze()

# print the generated text
print(tokenizer.decode(text_completion[0]))

# save the generated audio at 16 kHz
sf.write("answer.wav", audio_arr, 16000)
```
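
For repeated use, the steps above can be collected into one helper. This is a sketch under the assumption that `model`, `tokenizer`, `code_extractor`, `speaker_embedding`, and `gen_kwargs` are already set up as shown earlier:

```python
def answer_from_audio(wav_path, out_path="answer.wav"):
    # speech -> deduplicated hubert code tokens
    feats = code_extractor.get_feats(wav_path)
    codes = deduplicates(code_extractor.dump_label(feats))

    # code tokens -> prompt in the model's generation template
    label_text = convert_label_to_text(codes)
    prompt = f"<|spk_embed|><|startofaudio|>{label_text}<|endofaudio|><|startofcont|>"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

    # generate the spoken answer and its text transcription
    generation, text_completion = model.generate(
        prompt_input_ids=input_ids,
        speaker_embedding=speaker_embedding.to(model.dtype).to(model.device),
        **gen_kwargs,
    )

    sf.write(out_path, generation.float().cpu().numpy().squeeze(), 16000)
    return tokenizer.decode(text_completion[0])

print(answer_from_audio(f"{ckpt_path}/assets/question_example_1_MP3.mp3"))
```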

3 changes: 2 additions & 1 deletion flow_mirror_s/flow_mirror_model/__init__.py
```diff
@@ -12,5 +12,6 @@
 from .dac_wrapper import DACConfig, DACModel
 from transformers import AutoConfig, AutoModel
 
-AutoConfig.register("dac", DACConfig)
+
+AutoConfig.register("cac", DACConfig)
 AutoModel.register(DACConfig, DACModel)
```
```diff
@@ -3,7 +3,8 @@
 
 
 class DACConfig(PretrainedConfig):
-    model_type = "dac"
+    model_type = "cac"
+
 
     def __init__(
         self,
```
20 changes: 15 additions & 5 deletions flow_mirror_s/flow_mirror_model/modeling_flow_mirror.py
```diff
@@ -33,7 +33,6 @@
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
     logging,
-    replace_return_docstrings,
 )
 
 from .configuration_flow_mirror import FlowmirrorConfig, FlowmirrorDecoderConfig
@@ -1056,7 +1055,7 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model.decoder
 
-    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1333,7 +1332,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
 
         return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
-    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -1523,7 +1522,12 @@ def _prepare_decoder_input_ids_for_generation(
             decoder_input_ids = None
 
         # 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that.
-        decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
+
+        assert decoder_start_token_id == bos_token_id, (
+            "Make sure that `decoder_start_token_id` is correctly defined and that it is the same as `bos_token_id`."
+            "Otherwise, the model will not behave as expected."
+        )
+
         if device is None:
             device = self.device
         decoder_input_ids_start = (
@@ -1897,6 +1901,10 @@ def generate(
             logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
             generation_config.guidance_scale = None
 
+
+        generation_config._eos_token_tensor = None
+
+
         # 9. prepare distribution pre_processing samplers
         logits_processor = self._get_logits_processor(
             generation_config=generation_config,
@@ -1931,7 +1939,9 @@
 
         elif is_sample_gen_mode:
             # 11. prepare logits warper
-            logits_warper = self._get_logits_warper(generation_config)
+
+            logits_warper = self._get_logits_warper(generation_config, self.device)
+
             # expand input_ids with `num_return_sequences` additional sequences per batch
             input_ids, model_kwargs = self._expand_inputs_for_generation(
                 input_ids=input_ids,
```
6 changes: 1 addition & 5 deletions flow_mirror_s/requirements.txt
```diff
@@ -1,4 +1,3 @@
-accelerate==0.32.1
 datasets==2.18.0
 descript_audio_codec==1.0.0
 evaluate==0.4.2
@@ -8,11 +7,8 @@ ninja==1.11.1.1
 joblib==1.4.2
 librosa==0.10.2.post1
 npy_append_array==0.9.16
-numpy==2.1.0
+numpy==2.0.1
 safetensors==0.4.4
 soundfile==0.12.1
-torch==2.4.0+cu118
-torchaudio==2.4.0+cu118
-torchvision==0.19.0+cu118
 tqdm==4.66.4
 transformers==4.44.2
```