diff --git a/jukebox/Interacting_with_Jukebox.ipynb b/jukebox/Interacting_with_Jukebox.ipynb index 616504d4eb..380910e813 100644 --- a/jukebox/Interacting_with_Jukebox.ipynb +++ b/jukebox/Interacting_with_Jukebox.ipynb @@ -23,7 +23,7 @@ "colab": {} }, "source": [ - "!pip install git+https://github.com/openai/jukebox.git" + "!pip install -q git+https://github.com/openai/jukebox.git" ], "execution_count": 0, "outputs": [] @@ -69,7 +69,6 @@ "import jukebox\n", "import torch as t\n", "import librosa\n", - "import os\n", "from IPython.display import Audio\n", "from jukebox.make_models import make_vqvae, make_prior, MODELS, make_model\n", "from jukebox.hparams import Hyperparams, setup_hparams\n", @@ -100,20 +99,19 @@ "colab": {} }, "source": [ - "model = \"5b_lyrics\" # or \"1b_lyrics\" \n", + "model = \"5b_lyrics\" # or \"1b_lyrics\"\n", "hps = Hyperparams()\n", "hps.sr = 44100\n", - "hps.n_samples = 3 if model=='5b_lyrics' else 8\n", + "hps.n_samples = 3 if model == '5b_lyrics' else 8\n", "hps.name = 'samples'\n", - "chunk_size = 16 if model==\"5b_lyrics\" else 32\n", - "max_batch_size = 3 if model==\"5b_lyrics\" else 16\n", + "chunk_size = 16 if model == \"5b_lyrics\" else 32\n", + "max_batch_size = 3 if model == \"5b_lyrics\" else 16\n", "hps.levels = 3\n", - "hps.hop_fraction = [.5,.5,.125]\n", + "hps.hop_fraction = [.5, .5, .125]\n", "\n", "vqvae, *priors = MODELS[model]\n", - "vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length = 1048576)), device)\n", - "top_prior = make_prior(setup_hparams(priors[-1], dict()), vqvae, device)\n", - "\n" + "vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length=1048576)), device)\n", + "top_prior = make_prior(setup_hparams(priors[-1], dict()), vqvae, device)" ], "execution_count": 0, "outputs": [] @@ -137,13 +135,13 @@ }, "source": [ "sample_length_in_seconds = 60 # Full length of musical sample to generate - we find songs in the 1 to 4 minute\n", - " # range work well, with generation time proportional to sample 
length. \n", - " # This total length affects how quickly the model \n", + " # range work well, with generation time proportional to sample length.\n", + " # This total length affects how quickly the model\n", " # progresses through lyrics (model also generates differently\n", " # depending on if it thinks it's in the beginning, middle, or end of sample)\n", "\n", "hps.sample_length = (int(sample_length_in_seconds*hps.sr)//top_prior.raw_to_tokens)*top_prior.raw_to_tokens\n", - "assert hps.sample_length >= top_prior.n_ctx*top_prior.raw_to_tokens, f'Please choose a larger sampling rate'" + "assert hps.sample_length >= top_prior.n_ctx*top_prior.raw_to_tokens, 'Please choose a larger sampling rate'" ], "execution_count": 0, "outputs": [] @@ -156,27 +154,27 @@ "colab": {} }, "source": [ - "metas = [dict(artist = \"Zac Brown Band\",\n", - " genre = \"Country\",\n", - " total_length = hps.sample_length,\n", - " offset = 0,\n", - " lyrics = \"\"\"I met a traveller from an antique land,\n", - " Who said—“Two vast and trunkless legs of stone\n", - " Stand in the desert. . . . Near them, on the sand,\n", - " Half sunk a shattered visage lies, whose frown,\n", - " And wrinkled lip, and sneer of cold command,\n", - " Tell that its sculptor well those passions read\n", - " Which yet survive, stamped on these lifeless things,\n", - " The hand that mocked them, and the heart that fed;\n", - " And on the pedestal, these words appear:\n", - " My name is Ozymandias, King of Kings;\n", - " Look on my Works, ye Mighty, and despair!\n", - " Nothing beside remains. 
Round the decay\n", - " Of that colossal Wreck, boundless and bare\n", - " The lone and level sands stretch far away\n", - " \"\"\",\n", - " ),\n", - " ] * hps.n_samples\n", + "metas = [dict(artist=\"Zac Brown Band\",\n", + " genre=\"Country\",\n", + " total_length=hps.sample_length,\n", + " offset=0,\n", + " lyrics=\"\"\"I met a traveller from an antique land,\n", + " Who said—“Two vast and trunkless legs of stone\n", + " Stand in the desert. . . . Near them, on the sand,\n", + " Half sunk a shattered visage lies, whose frown,\n", + " And wrinkled lip, and sneer of cold command,\n", + " Tell that its sculptor well those passions read\n", + " Which yet survive, stamped on these lifeless things,\n", + " The hand that mocked them, and the heart that fed;\n", + " And on the pedestal, these words appear:\n", + " My name is Ozymandias, King of Kings;\n", + " Look on my Works, ye Mighty, and despair!\n", + " Nothing beside remains. Round the decay\n", + " Of that colossal Wreck, boundless and bare\n", + " The lone and level sands stretch far away\n", + " \"\"\"\n", + " )\n", + " ] * hps.n_samples\n", "labels = [None, None, top_prior.labeller.get_batch_labels(metas, 'cuda')]" ], "execution_count": 0, @@ -208,10 +206,10 @@ "chunk_size = 16 if model == \"5b_lyrics\" else 32\n", "sampling_kwargs = [dict(temp=.99, fp16=True, max_batch_size=lower_batch_size,\n", " chunk_size=lower_level_chunk_size),\n", - " dict(temp=0.99, fp16=True, max_batch_size=lower_batch_size,\n", - " chunk_size=lower_level_chunk_size),\n", - " dict(temp=sampling_temperature, fp16=True, \n", - " max_batch_size=max_batch_size, chunk_size=chunk_size)]" + " dict(temp=.99, fp16=True, max_batch_size=lower_batch_size,\n", + " chunk_size=lower_level_chunk_size),\n", + " dict(temp=sampling_temperature, fp16=True,\n", + " max_batch_size=max_batch_size, chunk_size=chunk_size)]" ], "execution_count": 0, "outputs": [] @@ -225,7 +223,7 @@ "source": [ "Now we're ready to sample from the model. 
We'll generate the top level (2) first, followed by the first upsampling (level 1), and the second upsampling (0). In this CoLab we load the top prior separately from the upsamplers, because of memory concerns on the hosted runtimes. If you are using a local machine, you can also load all models directly with make_models, and then use sample.py's ancestral_sampling to put this all in one step.\n", "\n", - "After each level, we decode to raw audio and save the audio files. \n", + "After each level, we decode the tokens to raw audio and save the audio files. \n", "\n", "This next cell will take a while (approximately 10 minutes per 20 seconds of music sample)" ] @@ -238,7 +236,8 @@ "colab": {} }, "source": [ - "zs = [t.zeros(hps.n_samples,0,dtype=t.long, device='cuda') for _ in range(len(priors))]\n", + "zs = [t.zeros(hps.n_samples, 0, dtype=t.long, device='cuda')\n", + " for _ in range(len(priors))]\n", "zs = _sample(zs, labels, sampling_kwargs, [None, None, top_prior], [2], hps)" ], "execution_count": 0, @@ -285,13 +284,14 @@ "colab": {} }, "source": [ - "# Set this False if you are on a local machine that has enough memory (this allows you to do the\n", - "# lyrics alignment visualization during the upsampling stage). For a hosted runtime, \n", - "# we'll need to go ahead and delete the top_prior if you are using the 5b_lyrics model.\n", + "# Set this False if you are on a local machine that has enough memory\n", + "# (this allows you to do the lyrics alignment visualization during the\n", + "# upsampling stage). 
For a hosted runtime, we'll need to go ahead and delete\n", + "# the top_prior if you are using the 5b_lyrics model.\n", "if True:\n", " del top_prior\n", " empty_cache()\n", - " top_prior=None\n", + " top_prior = None\n", "upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, 'cpu') for prior in priors[:-1]]\n", "labels[:2] = [prior.labeller.get_batch_labels(metas, 'cuda') for prior in upsamplers]" ], @@ -316,7 +316,7 @@ "colab": {} }, "source": [ - "zs = upsample(zs, labels, sampling_kwargs, [*upsamplers, top_prior], hps)\n" + "zs = upsample(zs, labels, sampling_kwargs, [*upsamplers, top_prior], hps)" ], "execution_count": 0, "outputs": [] @@ -388,19 +388,20 @@ "colab": {} }, "source": [ - "model = \"5b_lyrics\" # or \"1b_lyrics\"\n", + "model = \"5b_lyrics\" # or \"1b_lyrics\"\n", "hps = Hyperparams()\n", "hps.sr = 44100\n", - "hps.n_samples = 3 if model=='5b_lyrics' else 16\n", + "hps.n_samples = 3 if model == '5b_lyrics' else 16\n", "hps.name = 'co_composer'\n", - "hps.sample_length = 1048576 if model==\"5b_lyrics\" else 786432 \n", - "chunk_size = 16 if model==\"5b_lyrics\" else 32\n", - "max_batch_size = 3 if model==\"5b_lyrics\" else 16\n", - "hps.hop_fraction = [.5, .5, .125] \n", + "hps.sample_length = 1048576 if model == \"5b_lyrics\" else 786432\n", + "chunk_size = 16 if model == \"5b_lyrics\" else 32\n", + "max_batch_size = 3 if model == \"5b_lyrics\" else 16\n", + "hps.hop_fraction = [.5, .5, .125]\n", "hps.levels = 3\n", "\n", "vqvae, *priors = MODELS[model]\n", - "vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length = hps.sample_length)), device)\n", + "vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length=hps.sample_length)),\n", + " device)\n", "top_prior = make_prior(setup_hparams(priors[-1], dict()), vqvae, device)" ], "execution_count": 0, @@ -425,27 +426,27 @@ }, "source": [ "total_sample_length_in_seconds = 120\n", - "metas = [dict(artist = \"Zac Brown Band\",\n", - " genre = \"Country\",\n", - " total_length = 
total_sample_length_in_seconds * hps.sr,\n", - " offset = 0,\n", - " lyrics = \"\"\"I met a traveller from an antique land,\n", - " Who said—“Two vast and trunkless legs of stone\n", - " Stand in the desert. . . . Near them, on the sand,\n", - " Half sunk a shattered visage lies, whose frown,\n", - " And wrinkled lip, and sneer of cold command,\n", - " Tell that its sculptor well those passions read\n", - " Which yet survive, stamped on these lifeless things,\n", - " The hand that mocked them, and the heart that fed;\n", - " And on the pedestal, these words appear:\n", - " My name is Ozymandias, King of Kings;\n", - " Look on my Works, ye Mighty, and despair!\n", - " Nothing beside remains. Round the decay\n", - " Of that colossal Wreck, boundless and bare\n", - " The lone and level sands stretch far away\n", - " \"\"\",\n", - " ),\n", - " ] * hps.n_samples\n", + "metas = [dict(artist=\"Zac Brown Band\",\n", + " genre=\"Country\",\n", + " total_length=total_sample_length_in_seconds * hps.sr,\n", + " offset=0,\n", + " lyrics=\"\"\"I met a traveller from an antique land,\n", + " Who said—“Two vast and trunkless legs of stone\n", + " Stand in the desert. . . . Near them, on the sand,\n", + " Half sunk a shattered visage lies, whose frown,\n", + " And wrinkled lip, and sneer of cold command,\n", + " Tell that its sculptor well those passions read\n", + " Which yet survive, stamped on these lifeless things,\n", + " The hand that mocked them, and the heart that fed;\n", + " And on the pedestal, these words appear:\n", + " My name is Ozymandias, King of Kings;\n", + " Look on my Works, ye Mighty, and despair!\n", + " Nothing beside remains. 
Round the decay\n", + " Of that colossal Wreck, boundless and bare\n", + " The lone and level sands stretch far away\n", + " \"\"\"\n", + " )\n", + " ] * hps.n_samples\n", "labels = top_prior.labeller.get_batch_labels(metas, 'cuda')" ], "execution_count": 0, @@ -489,7 +490,8 @@ }, "source": [ "initial_generation_in_seconds = 4\n", - "tokens_to_sample = seconds_to_tokens(initial_generation_in_seconds, hps.sr, top_prior, chunk_size)" + "tokens_to_sample = seconds_to_tokens(initial_generation_in_seconds, hps.sr,\n", + " top_prior, chunk_size)" ], "execution_count": 0, "outputs": [] @@ -527,8 +529,9 @@ "colab": {} }, "source": [ - "zs=[t.zeros(hps.n_samples,0,dtype=t.long, device='cuda') for _ in range(3)]\n", - "zs=sample_partial_window(zs, labels, sampling_kwargs, 2, top_prior, tokens_to_sample, hps)\n", + "zs = [t.zeros(hps.n_samples, 0, dtype=t.long, device='cuda') for _ in range(3)]\n", + "zs = sample_partial_window(zs, labels, sampling_kwargs, 2, top_prior,\n", + " tokens_to_sample, hps)\n", "x = vqvae.decode(zs[2:], start_level=2).cpu().numpy()" ], "execution_count": 0, @@ -555,7 +558,8 @@ }, "source": [ "for i in range(hps.n_samples):\n", - " librosa.output.write_wav(f'noisy_top_level_generation_{i}.wav', x[i], sr=44100)" + " librosa.output.write_wav(f'noisy_top_level_generation_{i}.wav', x[i],\n", + " sr=44100)" ], "execution_count": 0, "outputs": [] @@ -631,7 +635,7 @@ "colab": {} }, "source": [ - "my_choice=0" + "my_choice = 0" ], "execution_count": 0, "outputs": [] @@ -644,7 +648,7 @@ "colab": {} }, "source": [ - "zs[2]=zs[2][my_choice].repeat(hps.n_samples,1)\n", + "zs[2] = zs[2][my_choice].repeat(hps.n_samples, 1)\n", "t.save(zs, 'zs-checkpoint2.t')" ], "execution_count": 0, @@ -660,7 +664,7 @@ "source": [ "# Set to True to load the previous checkpoint:\n", "if False:\n", - " zs=t.load('zs-checkpoint2.t') " + " zs = t.load('zs-checkpoint2.t')" ], "execution_count": 0, "outputs": [] @@ -683,8 +687,9 @@ "colab": {} }, "source": [ - 
"continue_generation_in_seconds=4\n", - "tokens_to_sample = seconds_to_tokens(continue_generation_in_seconds, hps.sr, top_prior, chunk_size)" + "continue_generation_in_seconds = 4\n", + "tokens_to_sample = seconds_to_tokens(continue_generation_in_seconds, hps.sr,\n", + " top_prior, chunk_size)" ], "execution_count": 0, "outputs": [] @@ -707,7 +712,8 @@ "colab": {} }, "source": [ - "zs = sample_partial_window(zs, labels, sampling_kwargs, 2, top_prior, tokens_to_sample, hps)\n", + "zs = sample_partial_window(zs, labels, sampling_kwargs, 2, top_prior,\n", + " tokens_to_sample, hps)\n", "x = vqvae.decode(zs[2:], start_level=2).cpu().numpy()" ], "execution_count": 0, @@ -720,7 +726,7 @@ "colab_type": "text" }, "source": [ - "Now listen to the longer versions of the sample you selected, and again choose a favorite sample. If you don't like any, return back to the cell where you can load the checkpoint, and continue again from there.\n", + "Now listen to the longer versions of the sample you selected, and again choose a favorite sample. If you don't like any, return to the cell where you can load the checkpoint, and continue again from there.\n", "\n", "When the samples start getting long, you might not always want to listen from the start, so change the playback start time later on if you like." ] @@ -733,7 +739,7 @@ "colab": {} }, "source": [ - "playback_start_time_in_seconds = 0 " + "playback_start_time_in_seconds = 0" ], "execution_count": 0, "outputs": [] @@ -747,7 +753,9 @@ }, "source": [ "for i in range(hps.n_samples):\n", - " librosa.output.write_wav(f'top_level_continuation_{i}.wav', x[i][playback_start_time_in_seconds*44100:], sr=44100)" + " librosa.output.write_wav(f'top_level_continuation_{i}.wav',\n", + " x[i][playback_start_time_in_seconds*44100:],\n", + " sr=44100)" ], "execution_count": 0, "outputs": [] @@ -818,7 +826,7 @@ "colab_type": "text" }, "source": [ - "Choose your favorite sample from your latest group of generations. 
(If you haven't already gone through the Co-Composition block, make sure to do that first so you have a generation to upsample)." + "Choose your favorite sample from your latest group of generations. (If you haven't already gone through the Co-Composition block, make sure to do that first, so you have a generation to upsample)." ] }, { @@ -830,9 +838,10 @@ }, "source": [ "choice = 0\n", - "select_best_sample = True # Set false if you want to upsample all your samples \n", - " # upsampling sometimes yields subtly different results on multiple runs,\n", - " # so this way you can choose your favorite upsampling" + "select_best_sample = True # Set false if you want to upsample all your samples\n", + " # upsampling sometimes yields subtly different\n", + " # results on multiple runs, so this way you can\n", + " # choose your favorite upsampling" ], "execution_count": 0, "outputs": [] @@ -846,7 +855,7 @@ }, "source": [ "if select_best_sample:\n", - " zs[2]=zs[2][choice].repeat(zs[2].shape[0],1)\n", + " zs[2] = zs[2][choice].repeat(zs[2].shape[0], 1)\n", "\n", "t.save(zs, 'zs-top-level-final.t')" ], @@ -874,7 +883,7 @@ "if False:\n", " zs = t.load('zs-top-level-final.t')\n", "\n", - "assert zs[2].shape[1]>=2048, f'Please first generate at least 2048 tokens at the top level, currently you have {zs[2].shape[1]}'\n", + "assert zs[2].shape[1] >= 2048, f'Please first generate at least 2048 tokens at the top level, currently you have {zs[2].shape[1]}'\n", "hps.sample_length = zs[2].shape[1]*top_prior.raw_to_tokens" ], "execution_count": 0, @@ -888,15 +897,17 @@ "colab": {} }, "source": [ - "# Set this False if you are on a local machine that has enough memory (this allows you to do the\n", - "# lyrics alignment visualization). 
For a hosted runtime, we'll need to go ahead and delete the top_prior\n", + "# Set this False if you are on a local machine that has enough memory\n", + "# (this allows you to do the lyrics alignment visualization).\n", + "# For a hosted runtime, we'll need to go ahead and delete the top_prior\n", "# if you are using the 5b_lyrics model.\n", "if True:\n", " del top_prior\n", " empty_cache()\n", - " top_prior=None\n", + " top_prior = None\n", "\n", - "upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, 'cpu') for prior in priors[:-1]]" + "upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, 'cpu')\n", + " for prior in priors[:-1]]" ], "execution_count": 0, "outputs": [] @@ -910,11 +921,12 @@ }, "source": [ "sampling_kwargs = [dict(temp=.99, fp16=True, max_batch_size=16, chunk_size=32),\n", - " dict(temp=0.99, fp16=True, max_batch_size=16, chunk_size=32),\n", - " None]\n", + " dict(temp=.99, fp16=True, max_batch_size=16, chunk_size=32),\n", + " None]\n", "\n", - "if type(labels)==dict:\n", - " labels = [prior.labeller.get_batch_labels(metas, 'cuda') for prior in upsamplers] + [labels] " + "if isinstance(labels, dict):\n", + " labels = [prior.labeller.get_batch_labels(metas, 'cuda')\n", + " for prior in upsamplers] + [labels]" ], "execution_count": 0, "outputs": []