# pb_bss convention: STFT-domain signals are capitalized, time-domain signals lowercase.
Observation = stft(observation)
num_samples = observation.shape[-1]
Y_mm = rearrange(Observation, 'd t f -> f t d')
t = Trainer()
affiliation = t.fit(
    Y_mm,
    num_classes=3,
    iterations=iterations * 2,
    weight_constant_axis=-1,
).predict(Y_mm)
# Resolve the per-frequency permutation of the EM posteriors.
pa = DHTVPermutationAlignment.from_stft_size(512)
affiliation_pa = pa(rearrange(affiliation, 'f k t -> k f t'))
affiliation_pa = rearrange(affiliation_pa, 'k f t -> k t f')
# Mask the reference channel: (T, F) * (K, T, F) broadcasts to one masked
# spectrogram per class (two speakers and noise).
Speech_image_0_est, Speech_image_1_est, Noise_image_est = Observation[reference_channel, :, :] * affiliation_pa
speech_image_0_est = istft(Speech_image_0_est, num_samples=num_samples)
speech_image_1_est = istft(Speech_image_1_est, num_samples=num_samples)
noise_image_est = istft(Noise_image_est, num_samples=num_samples)
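# For intuition, a minimal self-contained sketch of the masking step above,
# with NumPy stand-ins for the pb_bss signals (shapes are illustrative):
# a (K, T, F) affiliation mask multiplies a (T, F) spectrogram by
# broadcasting, and the result unpacks into K masked estimates.
import numpy as np

T_, F_, K_ = 50, 257, 3
Observation_ref = np.random.randn(T_, F_) + 1j * np.random.randn(T_, F_)
# Affiliations sum to one over the class axis, like EM posteriors.
affiliation_demo = np.random.dirichlet(np.ones(K_), size=(T_, F_)).transpose(2, 0, 1)

est_0, est_1, est_2 = Observation_ref * affiliation_demo  # (K, T, F) unpacks over axis 0
assert est_0.shape == (T_, F_)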
###########################################################################
# Calculate the metrics
speech_image = ex['audio_data']['speech_image']
noise_image = ex['audio_data']['noise_image']
speech_source = ex['audio_data']['speech_source']
Speech_image = stft(speech_image)
Noise_image = stft(noise_image)
if postfilter is None:
    postfilter_fn = lambda x: x  # identity
elif postfilter == 'mask_mul':
    # 'mask_mul' multiplies the estimate with the mask
    # (mask layout assumed to be 't k f', as elsewhere in this function).
    postfilter_fn = lambda x: x * rearrange(mask, 't k f -> k f t')
else:
    raise ValueError(postfilter)
Speech_prediction = apply_beamforming_vector(
    vector=rearrange(beamformers, 'f k d -> k f d', k=K, d=D, f=F),
    mix=rearrange(Observation, 'd t f -> f d t', d=D, t=T, f=F),
)
Speech_prediction = postfilter_fn(Speech_prediction)
speech_prediction = istft(rearrange(Speech_prediction, 'k f t -> k t f', k=K, t=T, f=F), num_samples=N)
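# apply_beamforming_vector is, at its core, an inner product over the channel
# axis. A minimal NumPy equivalent (a sketch assuming pb_bss's convention of
# conjugating the beamforming vector; not the library's actual code):
import numpy as np

def apply_beamforming_vector_sketch(vector, mix):
    """vector: (..., D), mix: (..., D, T) -> (..., T)."""
    return np.einsum('...d,...dt->...t', vector.conj(), mix)

K_, F_, D_, T_ = 3, 257, 6, 50
vec = np.random.randn(K_, F_, D_) + 1j * np.random.randn(K_, F_, D_)
mix = np.random.randn(F_, D_, T_) + 1j * np.random.randn(F_, D_, T_)
assert apply_beamforming_vector_sketch(vec, mix).shape == (K_, F_, T_)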
if Speech_image is None:
    speech_contribution = None
else:
    Speech_contribution = apply_beamforming_vector(
        vector=rearrange(beamformers, 'f k d -> k f d', k=K, d=D, f=F),
        mix=rearrange(Speech_image, '(ksource k) d t f -> ksource k f d t', k=1, d=D, t=T, f=F),
    )
    Speech_contribution = postfilter_fn(Speech_contribution)
    # ksource in [K-1, K]: the number of true speech sources may be K or K-1
    # (one estimated class may be noise).
    speech_contribution = istft(rearrange(Speech_contribution, 'ksource k f t -> ksource k t f', k=K, t=T, f=F), num_samples=N)
if Noise_image is None:
    noise_contribution = None
else:
    Noise_contribution = apply_beamforming_vector(
        vector=rearrange(beamformers, 'f k d -> k f d', k=K, d=D, f=F),
        mix=rearrange(Noise_image, '(k d) t f -> k f d t', k=1, d=D, t=T, f=F),
    )
    Noise_contribution = postfilter_fn(Noise_contribution)
    noise_contribution = istft(rearrange(Noise_contribution, 'k f t -> k t f', k=K, t=T, f=F), num_samples=N)
 'mir_eval_sxr_sar': array([149.07223578, 147.06942287]),
 'mir_eval_sxr_selection': array([0, 1]),
 'invasive_sxr_sdr': array([12.32048218, 9.61471296]),
 'invasive_sxr_sir': array([12.41346788, 9.69274082]),
 'invasive_sxr_snr': array([29.06057363, 27.10901422])}
"""
_, N = speech_source.shape
K = mask.shape[-2]
D, T, F = Observation.shape
assert K < 10, (K, mask.shape, N, D, T, F)
assert D < 30, (K, N, D, T, F)
psds = get_power_spectral_density_matrix(
    rearrange(Observation, 'd t f -> f d t', d=D, t=T, f=F),
    rearrange(mask, 't k f -> f k t', k=K, t=T, f=F),
)  # shape: (f, ktarget, d, d)
assert psds.shape == (F, K, D, D), (psds.shape, (F, K, D, D))
beamformers = list()
for k_target in range(K):
    target_psd = psds[:, k_target]
    # Leave-one-out: the PSDs of all other classes form the distortion PSD.
    distortion_psd = np.sum(np.delete(psds, k_target, axis=1), axis=1)
    beamformers.append(
        get_single_source_bf_vector(
            bf_algorithm,
            target_psd_matrix=target_psd,
            noise_psd_matrix=distortion_psd,
        )
    )
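# The np.delete / np.sum pair above is a leave-one-out sum over the class
# axis; a quick NumPy sanity check (shapes are illustrative):
import numpy as np

F_, K_, D_ = 4, 3, 2
psds_demo = np.random.randn(F_, K_, D_, D_)
loo = np.sum(np.delete(psds_demo, 1, axis=1), axis=1)
np.testing.assert_allclose(loo, psds_demo.sum(axis=1) - psds_demo[:, 1])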
def invasive_sxr(self):
    from pb_bss.evaluation.sxr_module import output_sxr
    invasive_sxr = output_sxr(
        rearrange(
            self.speech_contribution,
            # Identity pattern: doubles as a rank check.
            'sources targets samples -> sources targets samples'
        )[:, self.selection, :],
        rearrange(
            self.noise_contribution, 'targets samples -> targets samples'
        )[self.selection, :],
        average_sources=False,
        return_dict=True,
    )
    return invasive_sxr
metric = OutputMetrics(
    speech_prediction=speech_prediction,
    speech_source=speech_source,
    speech_contribution=speech_contribution,
    noise_contribution=noise_contribution,
    sample_rate=8000,
    enable_si_sdr=False,
)
return metric
def stoi(self):
    scores = pb_bss.evaluation.stoi(
        reference=rearrange(
            [self.speech_source] * self.channels,
            'channels sources samples -> sources channels samples'
        ),
        estimation=rearrange(
            # Identity pattern: stacks the list and checks the rank.
            [self.observation] * self.K_source,
            'sources channels samples -> sources channels samples'
        ),
        sample_rate=self.sample_rate,
    )
    return scores
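# rearrange also accepts a list of arrays and stacks it along a new leading
# axis, which is what the [x] * n replication above relies on; a minimal
# illustration with made-up shapes:
import numpy as np
from einops import rearrange

speech_source_demo = np.random.randn(3, 16000)  # sources x samples
reference_demo = rearrange(
    [speech_source_demo] * 2,  # 2 channels -> (channels, sources, samples)
    'channels sources samples -> sources channels samples',
)
assert reference_demo.shape == (3, 2, 16000)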
Rearrange("t h w c -> c t h w"),
Resize(args.frame_size),
Normalize(mean=mean, std=std),
])
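# Rearrange (from einops.layers.torch) is the layer form of rearrange, so it
# composes with other transforms like any module; a small standalone check:
import torch
from einops.layers.torch import Rearrange

to_cthw = Rearrange("t h w c -> c t h w")
clip = torch.rand(32, 128, 128, 3)  # frames as T H W C
assert to_cthw(clip).shape == (3, 32, 128, 128)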
# Take first clip from video only for now.
# Could be made to run on the full video.
dataset = VideoDataset(args.video, clip=32, transform=transform)
video = next(iter(dataset))
# video = torch.rand(3, 32, 128, 128)
assert video.size()[0:2] == (3, 32)
video = rearrange(video, "c t h w -> () c t h w")
video = video.data.cpu().numpy()
# Put the video data into a graph leaf node with grads, on the device
video = torch.tensor(video, requires_grad=True, device=device)
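# Creating the input as a fresh tensor with requires_grad=True makes it an
# autograd leaf, so .grad accumulates on it during backward(); minimal check:
import torch

x = torch.tensor([1.0, 2.0], requires_grad=True)
(x * x).sum().backward()
print(x.is_leaf, x.grad)  # True tensor([2., 4.])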
# regularizer term
variation = TotalVariationLoss()
denormalize = Denormalize(mean=mean, std=std)
progress = tqdm(range(args.num_epochs))
for epoch in progress:
    loss = 0.
    acts = model(video)
def _force_order(self, names):
    s = ""
    ex = []
    for d in names:
        if d not in self._schema._names:
            # Name is absent from the tensor: insert a singleton axis via "()".
            ex.append(d)
            s += " ()"
        else:
            ex.append(d)
            s += " " + d
    tensor = rearrange(self._tensor, "%s -> %s" % (self._to_einops(), s))
    return self.__class__(tensor, ex)
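# The "()" token on the right-hand side of a rearrange pattern creates a new
# axis of length 1, which is how _force_order pads in missing dimensions:
import torch
from einops import rearrange

x = torch.rand(2, 3)
y = rearrange(x, "a b -> b () a")  # reorder and insert a singleton axis
assert y.shape == (3, 1, 2)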
Example:
>>> from IPython.lib.pretty import pprint
>>> ex = get_dataset('cv_dev93')[0]
>>> mask = get_mask_from_oracle(ex, 'IBM')
>>> metric, result = get_scores(ex, mask)
>>> pprint(result)
{'pesq': array([2.014, 1.78 ]),
'stoi': array([0.68236465, 0.61319396]),
'mir_eval_sxr_sdr': array([10.23933413, 10.01566298]),
'invasive_sxr_sdr': array([15.76439393, 13.86230425])}
"""
if Observation == 'Observation':
    metric = get_multi_speaker_metrics(
        mask=rearrange(mask, 'k t f -> t k f'),  # T Ktarget F
        Observation=ex['audio_data'][Observation],  # D T F (stft signal)
        speech_source=ex['audio_data']['speech_source'],  # Ksource N (time signal)
        Speech_image=ex['audio_data']['Speech_image'],  # Ksource D T F (stft signal)
        Noise_image=ex['audio_data']['Noise_image'],  # D T F (stft signal)
        istft=istft,  # callable(signal, num_samples=num_samples)
        bf_algorithm=beamformer,
        postfilter=postfilter,  # in [None, 'mask_mul']
    )
else:
    assert mask is None, mask
    assert beamformer == 'ch0', beamformer
    assert postfilter is None, postfilter
    metric = OutputMetrics(
        speech_prediction=ex['audio_data'][Observation][:, 0],
        speech_source=ex['audio_data']['speech_source'],
        # speech_contribution=speech_contribution,
video.data += args.lr * grad
# Force video to [0, 1]; note: we are in normalized space
for i in range(video.size(1)):
    cmin = (0. - mean[i]) / std[i]
    cmax = (1. - mean[i]) / std[i]
    video.data[0, i].clamp_(cmin, cmax)
video.grad.data.zero_()
progress.set_postfix({"loss": loss.item(), "tv": tv.item()})
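# Clamping to (0 - mean) / std and (1 - mean) / std keeps the clip valid in
# normalized space: after denormalizing, pixels land back in [0, 1]. A quick
# sanity check with arbitrary per-channel statistics:
import torch

m, s = 0.45, 0.225
x = torch.randn(1000).clamp_((0. - m) / s, (1. - m) / s)
denorm = x * s + m
assert denorm.min() >= 0. and denorm.max() <= 1.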
# Once we have our dream, denormalize it
# and turn it into a sequence of PIL images.
video = rearrange(video, "() c t h w -> c t h w")
video = denormalize(video)
video = rearrange(video, "c t h w -> t h w c")
video.clamp_(0, 1)
video = video.data.cpu().numpy()
assert video.shape[0] == 32
assert video.shape[3] == 3
assert video.dtype == np.float32
assert (video >= 0).all()
assert (video <= 1).all()
video = (video * 255).astype(np.uint8)
images = [Image.fromarray(v, mode="RGB") for v in video]
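# To persist the frame list, one option (a hypothetical continuation, not
# part of the original snippet) is PIL's animated GIF writer:
images[0].save(
    "dream.gif",
    save_all=True,
    append_images=images[1:],
    duration=40,  # ms per frame, i.e. 25 fps
    loop=0,
)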