Commit

Inference README update (#4)
* chore: Update README.md

* Minor Update README.md

* Update README.md for error fix.

* chore: Update README.md

* Update README.md for demo site deployment and formatting improvements

* chore: Update README.md for formatting improvements

* Update README.md for consistent formatting and language improvements

* Update README.md for consistent formatting and demo site URL

* Update flow_mirror-s model inference code.

* Update README.md for release inference code.

* README update

* requirements update

* Several changes for model adaptation

---------

Co-authored-by: peter65374 <futuretrader@gmail.com>
Co-authored-by: happen <happenmass@gmail.com>
Co-authored-by: jzx-ai-lab <165371609+jzx-ai-lab@users.noreply.github.com>
4 people authored Sep 3, 2024
1 parent 344c185 commit be7dea3
Showing 5 changed files with 124 additions and 15 deletions.
107 changes: 104 additions & 3 deletions flow_mirror_s/README.md
# Flow_mirror_s

This README covers inference with the Flow Mirror-s model.

## Requirements
```
conda create -n flowmirror python=3.10
conda activate flowmirror
# downgrade pip to 23.1.1 for the requirements of fairseq
pip install pip==23.1.1
pip3 install torch torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt
```
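
After installing, a quick sanity check (optional) confirms that the CUDA 11.8 build of PyTorch is the one in the environment:

```python
# Sanity check: the cu118 wheel of torch should see the GPU.
import torch

print(torch.__version__)           # expect something like "2.x.x+cu118"
print(torch.cuda.is_available())   # should print True on a CUDA 11.8 machine
```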

## Model download
### modelscope
```python
from modelscope import snapshot_download
snapshot_download('jzx-ai-lab/Flow_mirror', local_dir='jzx-ai-lab/Flow_mirror')
```
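### huggingface
If the checkpoint is also mirrored on the Hugging Face Hub (the repo id below is an assumption, mirroring the ModelScope one), `huggingface_hub` can fetch it the same way:

```python
# Sketch: assumes a Hugging Face mirror with the same repo id exists.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="jzx-ai-lab/Flow_mirror", local_dir="jzx-ai-lab/Flow_mirror")
```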
## Load flow_mirror model
### Load model

```python
from flow_mirror_model import FlowmirrorForConditionalGeneration
from hubert_kmeans import HubertCodeExtractor
from transformers import AutoTokenizer

ckpt_path = "jzx-ai-lab/Flow_mirror" # download from modelscope or huggingface
model = FlowmirrorForConditionalGeneration.from_pretrained(ckpt_path)
code_extractor = HubertCodeExtractor(
ckpt_path=f"{ckpt_path}/chinese-hubert-ckpt-20240628.pt",
km_path="hubert_kmeans/kmeans_500.pkl",
layer=24,
rank=0
)
tokenizer = AutoTokenizer.from_pretrained(f"{ckpt_path}/tokenizer")

model.eval().to(torch.float16).to("cuda")
```
### Load speaker_embedding from a .pt file
```python
speaker_embeddings = torch.load("hubert_kmeans/speaker_embedding.pt")
```
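
The layout of this checkpoint is not documented here, so a quick inspection (a sketch) can confirm whether it holds a single embedding tensor or a mapping from speaker names to tensors:

```python
# Sketch: inspect the loaded object before passing an embedding to generate().
if isinstance(speaker_embeddings, dict):
    print({name: tuple(t.shape) for name, t in speaker_embeddings.items()})
else:
    print(type(speaker_embeddings), tuple(speaker_embeddings.shape))
```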
### Extract speaker_embedding from a reference audio (the audio must be sampled at 16 kHz)
```python
from transformers import AutoFeatureExtractor
import soundfile as sf

speaker_encoder = model.speaker_encoder
feature_extractor = AutoFeatureExtractor.from_pretrained("hubert_kmeans")

ref_wav = f"{ckpt_path}/assets/question_example_1_MP3.mp3"
reference_audio_input = feature_extractor(sf.read(ref_wav)[0], sampling_rate=16000, return_tensors="pt").to("cuda")
speaker_embedding = speaker_encoder.encode(reference_audio_input["input_values"])
```
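
If the reference clip is not already at 16 kHz, resample it first. A minimal sketch using `librosa` (already pinned in `requirements.txt`):

```python
# Sketch: load at the native rate, then resample to the 16 kHz the encoder expects.
import librosa

wav, sr = librosa.load(ref_wav, sr=None)  # sr=None keeps the native sampling rate
if sr != 16000:
    wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)
reference_audio_input = feature_extractor(wav, sampling_rate=16000, return_tensors="pt").to("cuda")
```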

## Inference Code
```python
def deduplicates(cluster_ids):
    # collapse each run of identical cluster ids into a single id
    dup_cluster_list = []
    for i in range(len(cluster_ids)):
        if i + 1 < len(cluster_ids) and cluster_ids[i] == cluster_ids[i + 1]:
            continue
        dup_cluster_list.append(cluster_ids[i])
    return dup_cluster_list

def convert_label_to_text(label):
    # map each cluster id to its audio token, e.g. 42 -> "<|audio_42|>"
    return "".join(f"<|audio_{i}|>" for i in label)
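
# quick worked example (illustrative):
#   deduplicates([3, 3, 7, 7, 5])     -> [3, 7, 5]
#   convert_label_to_text([3, 7, 5])  -> "<|audio_3|><|audio_7|><|audio_5|>"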

# extract code tokens from the hubert features
feats = code_extractor.get_feats(f"{ckpt_path}/assets/question_example_1_MP3.mp3")
codes = code_extractor.dump_label(feats)

codes = deduplicates(codes)
label_text = convert_label_to_text(codes)

# apply the model's generation template
prompt = f"<|spk_embed|><|startofaudio|>{label_text}<|endofaudio|><|startofcont|>"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# define generation config
gen_kwargs = {
"do_sample": True,
"temperature": 0.9,
"max_new_tokens": 512,
"use_cache": True,
"min_new_tokens": 9 + 1,
}

generation, text_completion = model.generate(
    prompt_input_ids=input_ids.to("cuda"),
    speaker_embedding=speaker_embedding.to(model.dtype).to(model.device),
    **gen_kwargs,
)

# convert the generated waveform to a numpy array
audio_arr = generation.float().cpu().numpy().squeeze()

# print the generated text
print(tokenizer.decode(text_completion[0]))

# save the generated audio at 16 kHz
sf.write("answer.wav", audio_arr, 16000)
```
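
For repeated use, the steps above can be collected into one helper. This is a sketch under the assumption that `model`, `tokenizer`, `code_extractor`, `speaker_embedding`, and `gen_kwargs` are already set up as shown earlier:

```python
def answer_from_audio(wav_path, out_path="answer.wav"):
    # speech -> deduplicated hubert code tokens
    feats = code_extractor.get_feats(wav_path)
    codes = deduplicates(code_extractor.dump_label(feats))

    # code tokens -> prompt in the model's generation template
    label_text = convert_label_to_text(codes)
    prompt = f"<|spk_embed|><|startofaudio|>{label_text}<|endofaudio|><|startofcont|>"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

    # generate the spoken answer and its text transcription
    generation, text_completion = model.generate(
        prompt_input_ids=input_ids,
        speaker_embedding=speaker_embedding.to(model.dtype).to(model.device),
        **gen_kwargs,
    )

    sf.write(out_path, generation.float().cpu().numpy().squeeze(), 16000)
    return tokenizer.decode(text_completion[0])

print(answer_from_audio(f"{ckpt_path}/assets/question_example_1_MP3.mp3"))
```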

3 changes: 2 additions & 1 deletion flow_mirror_s/flow_mirror_model/__init__.py
```diff
@@ -12,5 +12,6 @@
 from .dac_wrapper import DACConfig, DACModel
 from transformers import AutoConfig, AutoModel
 
-AutoConfig.register("dac", DACConfig)
+
+AutoConfig.register("cac", DACConfig)
 AutoModel.register(DACConfig, DACModel)
```
```diff
@@ -3,7 +3,8 @@
 
 
 class DACConfig(PretrainedConfig):
-    model_type = "dac"
+    model_type = "cac"
+
 
     def __init__(
         self,
```
20 changes: 15 additions & 5 deletions flow_mirror_s/flow_mirror_model/modeling_flow_mirror.py
```diff
@@ -33,7 +33,6 @@
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
     logging,
-    replace_return_docstrings,
 )
 
 from .configuration_flow_mirror import FlowmirrorConfig, FlowmirrorDecoderConfig
@@ -1056,7 +1055,7 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.model.decoder
 
-    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1333,7 +1332,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
 
         return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
-    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -1523,7 +1522,12 @@ def _prepare_decoder_input_ids_for_generation(
             decoder_input_ids = None
 
         # 2. Encoder-decoder models expect the `decoder_input_ids` to start with a special token. Let's ensure that.
-        decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
+
+        assert decoder_start_token_id == bos_token_id, (
+            "Make sure that `decoder_start_token_id` is correctly defined and that it is the same as `bos_token_id`."
+            "Otherwise, the model will not behave as expected."
+        )
+
         if device is None:
             device = self.device
         decoder_input_ids_start = (
@@ -1897,6 +1901,10 @@ def generate(
             logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
             generation_config.guidance_scale = None
 
+
+        generation_config._eos_token_tensor = None
+
+
         # 9. prepare distribution pre_processing samplers
         logits_processor = self._get_logits_processor(
             generation_config=generation_config,
@@ -1931,7 +1939,9 @@
 
         elif is_sample_gen_mode:
             # 11. prepare logits warper
-            logits_warper = self._get_logits_warper(generation_config)
+
+            logits_warper = self._get_logits_warper(generation_config, self.device)
+
             # expand input_ids with `num_return_sequences` additional sequences per batch
             input_ids, model_kwargs = self._expand_inputs_for_generation(
                 input_ids=input_ids,
```
6 changes: 1 addition & 5 deletions flow_mirror_s/requirements.txt
```diff
@@ -1,4 +1,3 @@
-accelerate==0.32.1
 datasets==2.18.0
 descript_audio_codec==1.0.0
 evaluate==0.4.2
@@ -8,11 +7,8 @@ ninja==1.11.1.1
 joblib==1.4.2
 librosa==0.10.2.post1
 npy_append_array==0.9.16
-numpy==2.1.0
+numpy==2.0.1
 safetensors==0.4.4
 soundfile==0.12.1
-torch==2.4.0+cu118
-torchaudio==2.4.0+cu118
-torchvision==0.19.0+cu118
 tqdm==4.66.4
 transformers==4.44.2
```