diff --git a/docs/quickstart.rst b/docs/quickstart.rst index e66a548..1940da2 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -1,46 +1,62 @@ -One-shot example -^^^^^^^^^^^^^^^^ +Examples +======== + +How To Import +------------- + +.. code-block:: python + + import kapre # to import the whole library + from kapre import ( # `time_frequency` layers can be directly imported from `kapre` + STFT, + InverseSTFT, + Magnitude, + Phase, + MagnitudeToDecibel, + ApplyFilterbank, + Delta, + ConcatenateFrequencyMap, + ) + from kapre import ( # `signal` layers can be also directly imported from kapre + Frame, + Energy, + MuLawEncoding, + MuLawDecoding, + LogmelToMFCC, + ) + # from kapre import backend # we can do this, but `backend` might be a too general name + import kapre.backend # for namespace sanity, you might prefer this + from kapre import backend as kapre_backend # or maybe this + from kapre.composed import ( # function names in `composed` are purposefully verbose. + get_stft_magnitude_layer, + get_melspectrogram_layer, + get_log_frequency_spectrogram_layer, + get_perfectly_reconstructing_stft_istft, + get_stft_mag_phase, + get_frequency_aware_conv2d, + ) + +Use STFT Magnitude +------------------ .. code-block:: python - from tensorflow.keras.models import Sequential - from tensorflow.keras.layers import Conv2D, BatchNormalization, ReLU, GlobalAveragePooling2D, Dense, Softmax - from kapre import STFT, Magnitude, MagnitudeToDecibel - from kapre.composed import get_melspectrogram_layer, get_log_frequency_spectrogram_layer, get_stft_magnitude_layer - - # 6 channels (!), maybe 1-sec audio signal, for an example. - input_shape = (6, 44100) - sr = 44100 - model = Sequential() - # A STFT layer - model.add(STFT(n_fft=2048, win_length=2018, hop_length=1024, - window_fn=None, pad_end=False, - input_data_format='channels_last', output_data_format='channels_last', - input_shape=input_shape)) - model.add(Magnitude()) - model.add(MagnitudeToDecibel()) # these three layers can be replaced with get_stft_magnitude_layer() - # Alternatively, you may want to use a melspectrogram layer - # melgram_layer = get_melspectrogram_layer() - # or log-frequency layer - # log_stft_layer = get_log_frequency_spectrogram_layer() - - # add more layers as you want - model.add(Conv2D(32, (3, 3), strides=(2, 2))) - model.add(BatchNormalization()) - model.add(ReLU()) - model.add(GlobalAveragePooling2D()) - model.add(Dense(10)) - model.add(Softmax()) - - # Compile the model - model.compile('adam', 'categorical_crossentropy') # if single-label classification - - # train it with raw audio sample inputs - # for example, you may have functions that load your data as below. - x = load_x() # e.g., x.shape = (10000, 6, 44100) - y = load_y() # e.g., y.shape = (10000, 10) if it's 10-class classification - # then.. - model.fit(x, y) - # Done! +from tensorflow.keras.models import Sequential +from kapre import STFT, Magnitude, MagnitudeToDecibel +from kapre.composed import get_stft_magnitude_layer + +sampling_rate = 16000 # sampling rate of your input audio +duration = 20.0 # duration of the audio +num_channel = 2 # number of channels of the audio +input_shape = (num_channel, int(sampling_rate * duration)) # let's follow `channels_last` convention even for audio + +model = Sequential() +model.add(STFT(n_fft=2048, win_length=2018, hop_length=1024, + window_name='hann_window', pad_end=False, + input_data_format='channels_last', output_data_format='channels_last', + input_shape=input_shape)) +model.add(Magnitude()) +model.add(MagnitudeToDecibel()) # these three layers can be replaced with get_stft_magnitude_layer() + * See the Jupyter notebook at the `example folder `_ diff --git a/docs/release_note.rst b/docs/release_note.rst index d63dce3..dc91b74 100644 --- a/docs/release_note.rst +++ b/docs/release_note.rst @@ -1,6 +1,11 @@ Release Note ^^^^^^^^^^^^ +* 29 Sep 2020 + - 0.3.4 + - Fix a bug in `kapre.backend.get_window_fn()`. Previously, it only correctly worked with `None` input and + an erorr was raised when non-default value was set for `window_name` in any layer. + * 15 Sep 2020 - 0.3.3 - `kapre.augmentation` is added diff --git a/kapre/__init__.py b/kapre/__init__.py index 9ef965b..c0c9b81 100644 --- a/kapre/__init__.py +++ b/kapre/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.3.3' +__version__ = '0.3.4' VERSION = __version__ from . import composed diff --git a/kapre/backend.py b/kapre/backend.py index 3023e16..ab64344 100644 --- a/kapre/backend.py +++ b/kapre/backend.py @@ -53,7 +53,7 @@ def get_window_fn(window_name=None): ) ) - return window_name + return available_windows[window_name] def validate_data_format_str(data_format): diff --git a/setup.py b/setup.py index 51121fc..5aba284 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='kapre', - version='0.3.3', + version='0.3.4', description='Kapre: Keras Audio Preprocessors. Tensorflow.Keras layers for audio pre-processing in deep learning', author='Keunwoo Choi', url='http://github.com/keunwoochoi/kapre/', diff --git a/tests/test_time_frequency.py b/tests/test_time_frequency.py index 8f0a56f..6c08c1a 100644 --- a/tests/test_time_frequency.py +++ b/tests/test_time_frequency.py @@ -106,6 +106,66 @@ def _get_stft_model(following_layer=None): allclose_phase(np.angle(S_complex), S) +@pytest.mark.parametrize('data_format', ['channels_first', 'channels_last']) +@pytest.mark.parametrize('window_name', [None, 'hann_window', 'hamming_window']) +def test_spectrogram_correctness_more(data_format, window_name): + def _get_stft_model(following_layer=None): + # compute with kapre + stft_model = tensorflow.keras.models.Sequential() + stft_model.add( + STFT( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_name=window_name, + pad_end=False, + input_data_format=data_format, + output_data_format=data_format, + input_shape=input_shape, + name='stft', + ) + ) + if following_layer is not None: + stft_model.add(following_layer) + return stft_model + + n_fft = 512 + hop_length = 256 + n_ch = 2 + + src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch) + win_length = n_fft # test with x2 + # compute with librosa + S_ref = librosa.core.stft( + src_mono, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + center=False, + window=window_name.replace('_window', '') if window_name else 'hann', + ).T # (time, freq) + + S_ref = np.expand_dims(S_ref, axis=2) # time, freq, ch=1 + S_ref = np.tile(S_ref, [1, 1, n_ch]) # time, freq, ch=n_ch + if data_format == 'channels_first': + S_ref = np.transpose(S_ref, (2, 0, 1)) # ch, time, freq + + stft_model = _get_stft_model() + + S_complex = stft_model.predict(batch_src)[0] # 3d representation + allclose_complex_numbers(S_ref, S_complex) + + # test Magnitude() + stft_mag_model = _get_stft_model(Magnitude()) + S = stft_mag_model.predict(batch_src)[0] # 3d representation + np.testing.assert_allclose(np.abs(S_ref), S, atol=2e-4) + + # # test Phase() + stft_phase_model = _get_stft_model(Phase()) + S = stft_phase_model.predict(batch_src)[0] # 3d representation + allclose_phase(np.angle(S_complex), S) + + @pytest.mark.parametrize('n_fft', [512]) @pytest.mark.parametrize('sr', [22050]) @pytest.mark.parametrize('hop_length', [None, 256])