data_io.py

##########################################################
# pytorch-kaldi v.0.1                                      
# Mirco Ravanelli, Titouan Parcollet
# Mila, University of Montreal
# October 2018
##########################################################

import numpy as np
import sys
from utils import compute_cw_max,dict_fea_lab_arch,is_sequential_dict
import os
import configparser
import re, gzip, struct

def load_dataset(fea_scp,fea_opts,lab_folder,lab_opts,left,right, max_sequence_length, output_folder, fea_only=False):
    """Read features (and, unless fea_only, alignments) through Kaldi and flatten them.

    The archive referenced by the first scp entry is inspected to decide
    whether the input holds feature matrices (read with copy-feats) or raw
    waveforms (read with wav-copy). Utterances are then split into chunks
    according to max_sequence_length, the chunks are sorted by length and
    concatenated along the first axis.

    Arguments:
    fea_scp : path of the Kaldi scp file listing the input archives,
    fea_opts : string appended to the Kaldi read pipe,
    lab_folder : folder containing ali*.gz and final.mdl (ignored if fea_only),
    lab_opts : Kaldi command applied to the alignments (ignored if fea_only),
    left, right : accepted for signature compatibility, not used here,
    max_sequence_length : int (same chunk size/step for features and labels)
      or dict with the keys 'chunk_size_fea', 'chunk_step_fea',
      'chunk_size_lab', 'chunk_step_lab' (plus 'window_shift' and
      'window_size' when the input is a waveform with labels),
    output_folder : forwarded to the ark readers,
    fea_only : if True no alignments are read and zero labels are produced.

    Returns the list
    [chunk_names, fea_conc, lab_conc, end_index_fea, end_index_lab]
    where chunk_names[i] names the chunk ending at end_index_fea[i].
    """
    def _input_is_wav_file(fea_scp):
        # Look at the first line of the archive named by the first scp entry;
        # the input is treated as wav data when that line contains b'RIFF'.
        with open(fea_scp, 'r') as f:
            first_line = f.readline()
        ark_file = first_line.split(' ')[1].split(':')[0]
        with open(ark_file, 'rb') as f:
            first_ark_line = f.readline()
        return b'RIFF' in first_ark_line
    def _input_is_feature_file(fea_scp):
        return not _input_is_wav_file(fea_scp)
    def _read_features_and_labels_with_kaldi(fea_scp, fea_opts, fea_only, lab_folder, lab_opts, output_folder):
        # Returns two dicts keyed by utterance id. With labels enabled, only
        # utterances present in both the features and the alignments are kept.
        lab = dict()
        # The input is either a feature archive or a wav archive, so a single
        # probe of the scp file is enough to pick the reader.
        if _input_is_feature_file(fea_scp):
            kaldi_bin="copy-feats"
            read_function = read_mat_ark
        else:
            kaldi_bin="wav-copy"
            read_function = read_vec_flt_ark
        fea = { k:m for k,m in read_function('ark:'+kaldi_bin+' scp:'+fea_scp+' ark:- |'+fea_opts,output_folder) }
        if not fea_only:
            lab = { k:v for k,v in read_vec_int_ark('gunzip -c '+lab_folder+'/ali*.gz | '+lab_opts+' '+lab_folder+'/final.mdl ark:- ark:-|',output_folder)  if k in fea} # Note that I'm copying only the alignments of the loaded fea
            fea = {k: v for k, v in fea.items() if k in lab} # This way I remove all the features without an alignment (see log file in alidir "Did not Succeded")
        return fea, lab
    def _chunk_features_and_labels(max_sequence_length, fea, lab, fea_only, input_is_wav):
        # input_is_wav is accepted but not used by the chunking logic.
        def _append_to_concat_list(fea_chunked, lab_chunked, fea_conc, lab_conc, snt_name, name):
            # Chunks of a split utterance are named '<name>_split<j>'; an
            # utterance that was not split keeps its own name.
            for j in range(0, len(fea_chunked)):
                fea_conc.append(fea_chunked[j])
                lab_conc.append(lab_chunked[j])
                if len(fea_chunked) > 1:
                    snt_name.append(name+'_split'+str(j))
                else:
                    snt_name.append(name)
            return fea_conc, lab_conc
        def _chunk(max_sequence_length, fea, lab, fea_only):
            def _chunk_by_input_and_output_chunk_config(chunk_config, fea, lab, fea_only):
                '''
                If the sequence is longer than the chunk size it is split. A trailing
                piece is only cut off when more than chunk_size + chunk_size/4 frames
                remain, so the last chunk absorbs short remainders.
                With chunk size 500 the threshold is 500 + (500/4) = 625:
                a sequence of length 625 stays in one piece, a longer one is
                split into 500 and the remainder.
                '''
                chunk_size_fea, chunk_step_fea, chunk_size_lab, chunk_step_lab = chunk_config['chunk_size_fea'], chunk_config['chunk_step_fea'], chunk_config['chunk_size_lab'], chunk_config['chunk_step_lab']
                fea_chunked = list()
                lab_chunked = list()
                split_threshold_fea = chunk_size_fea + (chunk_size_fea/4)
                if(len(fea) > chunk_size_fea) and chunk_size_fea>0:
                    nr_of_chunks = (len(fea) + chunk_size_fea - 1) // chunk_size_fea
                    for i in range(nr_of_chunks):
                        chunk_start_fea = i * chunk_step_fea
                        if(len(fea[chunk_start_fea:]) > split_threshold_fea):
                            chunk_end_fea = chunk_start_fea + chunk_size_fea
                            fea_chunk = fea[chunk_start_fea:chunk_end_fea]
                            if not fea_only:
                                chunk_start_lab = i * chunk_step_lab
                                chunk_end_lab = chunk_start_lab + chunk_size_lab
                                lab_chunk = lab[chunk_start_lab:chunk_end_lab]
                            else:
                                # no alignments available: dummy zero labels
                                lab_chunk = np.zeros((fea_chunk.shape[0],))
                            fea_chunked.append(fea_chunk)
                            lab_chunked.append(lab_chunk)
                        else:
                            # what is left is short enough: take it all and stop
                            fea_chunk = fea[chunk_start_fea:]
                            if not fea_only:
                                chunk_start_lab = i * chunk_step_lab
                                lab_chunk = lab[chunk_start_lab:]
                            else:
                                lab_chunk = np.zeros((fea_chunk.shape[0],))
                            lab_chunked.append(lab_chunk)
                            fea_chunked.append(fea_chunk)
                            break
                else:
                    # short sequence or chunking disabled (chunk size <= 0)
                    fea_chunked.append(fea)
                    if not fea_only:
                      lab_chunked.append(lab)
                    else:
                      lab_chunked.append(np.zeros((fea.shape[0],)))
                return fea_chunked, lab_chunked

            chunk_config = dict()
            if type(max_sequence_length) == dict:
                chunk_config['chunk_size_fea'] = max_sequence_length['chunk_size_fea']
                chunk_config['chunk_step_fea'] = max_sequence_length['chunk_step_fea']
                chunk_config['chunk_size_lab'] = max_sequence_length['chunk_size_lab']
                chunk_config['chunk_step_lab'] = max_sequence_length['chunk_step_lab']
            elif type(max_sequence_length) == int:
                # a single number is used as size and step for both streams
                chunk_config['chunk_size_fea'] = max_sequence_length
                chunk_config['chunk_step_fea'] = max_sequence_length
                chunk_config['chunk_size_lab'] = max_sequence_length
                chunk_config['chunk_step_lab'] = max_sequence_length
            else:
                raise ValueError('Unknown type of max_sequence_length')
            return _chunk_by_input_and_output_chunk_config(chunk_config, fea, lab, fea_only)

        snt_name = list()
        fea_conc = list()
        lab_conc = list()
        # sort by utterance id first so that equal-length utterances have a
        # deterministic order, then by length
        feature_keys_sorted_by_sequence_length = sorted(sorted(fea.keys()), key=lambda k: len(fea[k]))
        for k in feature_keys_sorted_by_sequence_length:
            fea_el = fea[k]
            lab_el = None
            if not fea_only:
                lab_el = lab[k]
            fea_chunked, lab_chunked = _chunk(max_sequence_length, fea_el, lab_el, fea_only)
            fea_conc, lab_conc = _append_to_concat_list(fea_chunked, lab_chunked, fea_conc, lab_conc, snt_name, k)
        return fea_conc, lab_conc, snt_name
    def _concatenate_features_and_labels(fea_conc, lab_conc, snt_name):
        def _sort_chunks_by_length(fea_conc, lab_conc, snt_name):
            # The names travel with their chunks; sorting only the data would
            # leave the names in pre-sort order whenever an utterance was split.
            chunks_zipped = zip(fea_conc,lab_conc,snt_name)
            chunks_sorted = sorted(chunks_zipped, key=lambda x: x[0].shape[0])
            fea_conc,lab_conc,snt_name = zip(*chunks_sorted)
            return fea_conc, lab_conc, list(snt_name)
        def _get_end_index_from_list(conc):
            # cumulative number of rows after each chunk
            end_snt=0
            end_index=list()
            for entry in conc:
                end_snt=end_snt+entry.shape[0]
                end_index.append(end_snt)
            return end_index

        fea_conc, lab_conc, snt_name = _sort_chunks_by_length(fea_conc, lab_conc, snt_name)
        end_index_fea = _get_end_index_from_list(fea_conc)
        end_index_lab = _get_end_index_from_list(lab_conc)
        fea_conc=np.concatenate(fea_conc)
        lab_conc=np.concatenate(lab_conc)
        return fea_conc, lab_conc, end_index_fea, end_index_lab, snt_name
    def _match_feature_and_label_sequence_lengths(fea, lab, max_sequence_length):
        # Truncates or zero-pads every feature sequence to the length implied
        # by its label count: (nr_of_lab - 1) * window_shift + window_size.
        ALLOW_FRAME_DIFF_LARGER_ONE = False
        def _adjust_feature_sequence_length(fea, nr_of_fea_for_lab):
            nr_of_fea = fea.shape[0]
            if nr_of_fea > nr_of_fea_for_lab:
                fea_adj = np.take(fea, range(nr_of_fea_for_lab), axis=0)
            elif nr_of_fea < nr_of_fea_for_lab:
                padding = np.zeros(shape=(nr_of_fea_for_lab-nr_of_fea,) + fea.shape[1:])
                fea_adj = np.concatenate([fea, padding], axis=0)
            else:
                fea_adj = fea
            return fea_adj
        window_shift = max_sequence_length['window_shift']
        window_size = max_sequence_length['window_size']
        for k in fea.keys():
            nr_of_fea = fea[k].shape[0]
            nr_of_lab = lab[k].shape[0]
            nr_of_fea_for_lab = (nr_of_lab - 1) * window_shift + window_size
            if abs(nr_of_fea - nr_of_fea_for_lab) > window_shift and not ALLOW_FRAME_DIFF_LARGER_ONE:
               raise ValueError('Nr. of features: ' + str(nr_of_fea) + ' does not match nr. of labels: ' + str(nr_of_lab) + ' with expected nr. of features: ' + str(nr_of_fea_for_lab))
            fea[k] = _adjust_feature_sequence_length(fea[k], nr_of_fea_for_lab)
        return fea, lab

    # probe the input type once and reuse the answer below
    input_is_wav = _input_is_wav_file(fea_scp)
    fea, lab = _read_features_and_labels_with_kaldi(fea_scp, fea_opts, fea_only, lab_folder, lab_opts, output_folder)
    if input_is_wav and (not fea_only):
        fea, lab = _match_feature_and_label_sequence_lengths(fea, lab, max_sequence_length)
    fea_chunks, lab_chunks, chunk_names = _chunk_features_and_labels(max_sequence_length, fea, lab, fea_only, input_is_wav)
    fea_conc, lab_conc, end_index_fea, end_index_lab, chunk_names = _concatenate_features_and_labels(fea_chunks, lab_chunks, chunk_names)
    return [chunk_names,fea_conc,lab_conc,np.asarray(end_index_fea),np.asarray(end_index_lab)]


def context_window_old(fea,left,right):
 """Stack each frame with its `left` preceding and `right` following frames.

 fea is indexed as (frame, feature). The result has one row per frame that
 has a complete context, i.e. N_row-left-right rows, each holding the
 frames t-left ... t+right flattened in chronological order.
 """
 n_frames, n_dims = fea.shape[0], fea.shape[1]
 window_len = left + right + 1
 stacked = np.empty((n_frames - left - right, n_dims * window_len))

 # Output row `row` is centred on input frame `row + left`, so its window
 # covers the input rows row ... row + left + right.
 for row in range(n_frames - left - right):
  stacked[row] = fea[row:row + window_len].flatten()

 return stacked

def context_window(fea,left,right):
    """Build context-window features by stacking rolled copies of fea.

    fea is indexed as (frame, feature). For every lag in -left ... right a
    copy of fea rolled by `lag` rows is written into the next block of
    columns, then the first `left` and last `right` rows are dropped.

    Because np.roll(fea, lag) puts fea[t - lag] at row t, the column blocks of
    an output row run from frame t+left down to frame t-right.

    NOTE(review): the rolled copies wrap around by up to `right` rows at the
    top and `left` rows at the bottom, while the crop removes `left` rows at
    the top and `right` rows at the bottom. With left != right some returned
    rows therefore contain wrapped-around frames - confirm this is intended.
    """
    n_frames, n_dims = fea.shape[0], fea.shape[1]
    stacked = np.empty([n_frames, n_dims * (left + right + 1)])

    for block, lag in enumerate(range(-left, right + 1)):
        first_col = block * n_dims
        stacked[:, first_col:first_col + n_dims] = np.roll(fea, lag, axis=0)

    return stacked[left:n_frames - right]


def load_chunk(fea_scp,fea_opts,lab_folder,lab_opts,left,right,max_sequence_length, output_folder,fea_only=False):
  """Load a chunk with load_dataset() and post-process it for training.

  Steps: apply the context window (when left or right is non-zero), shift the
  feature end indexes accordingly, normalise every feature column to zero mean
  and unit standard deviation, make the labels start from 0 and trim them to
  the rows kept by the context window.

  Returns [data_name, data_set, end_index_fea] where data_set holds the
  features with the labels appended as last column. The label end indexes
  returned by load_dataset() are currently ignored.
  """
  data_name, data_set, data_lab, end_index_fea, _end_index_lab = load_dataset(
      fea_scp, fea_opts, lab_folder, lab_opts, left, right,
      max_sequence_length, output_folder, fea_only)

  # Context window
  if not (left == 0 and right == 0):
      data_set = context_window(data_set, left, right)

  # every end index moves by `left`; the last one additionally by `right`
  end_index_fea = end_index_fea - left
  end_index_fea[-1] -= right

  # mean and variance normalization (per column)
  column_mean = np.mean(data_set, axis=0)
  column_std = np.std(data_set, axis=0)
  data_set = (data_set - column_mean) / column_std

  # Label processing: shift to start at 0, then drop the rows lost to the window
  data_lab = data_lab - data_lab.min()
  last_row = -right if right > 0 else None
  data_lab = data_lab[left:last_row]

  return [data_name, np.column_stack((data_set, data_lab)), end_index_fea]

def load_counts(class_counts_file):
    """Read the class counts stored on the first line of class_counts_file.

    The line is expected to hold whitespace separated numbers, optionally
    wrapped in square brackets. Returns them as a numpy array of float32
    values; later lines of the file are not read.
    """
    with open(class_counts_file) as handle:
        first_row = next(handle)
    # drop surrounding whitespace, then the brackets, then the inner padding
    tokens = first_row.strip().strip('[]').strip().split()
    return np.array([np.float32(token) for token in tokens])

def read_lab_fea_refac01(cfg_file, fea_only, shared_list, output_folder):
    """Load all features and labels described by a chunk config file.

    The result is not returned but appended to shared_list in this order:
    data_name, data_end_index_fea, data_end_index_lab, fea_dict, lab_dict,
    arch_dict and a dict {'input': features, 'ref': labels}.

    Arguments:
    cfg_file : path of the chunk-specific config file,
    fea_only : if True no labels are read (forced to True for to_do=forward),
    shared_list : list-like object that receives the results,
    output_folder : forwarded to the Kaldi readers.

    Exits the process (via sys.exit) when the config file is missing or when
    the different feature streams disagree on sentence ids or end indexes.
    """
    def _read_chunk_specific_config(cfg_file):
        if not(os.path.exists(cfg_file)):
            sys.stderr.write('ERROR: The config file %s does not exist!\n'%(cfg_file))
            sys.exit(0)
        else:
            config = configparser.ConfigParser()
            config.read(cfg_file)
        return config
    def _read_from_config(config, fea_only):
        def _get_max_seq_length_from_config_str(config_str):
            # Either a single integer, or six comma separated integers:
            # chunk_size_fea, chunk_step_fea, chunk_size_lab, chunk_step_lab,
            # window_shift, window_size.
            max_seq_length=[int(e) for e in config_str.split(',')]
            if len(max_seq_length) == 1:
                max_seq_length = max_seq_length[0]
            else:
                assert len(max_seq_length) == 6
                max_seq_length_list = max_seq_length
                max_seq_length = dict()
                max_seq_length['chunk_size_fea'] = max_seq_length_list[0]
                max_seq_length['chunk_step_fea'] = max_seq_length_list[1]
                max_seq_length['chunk_size_lab'] = max_seq_length_list[2]
                max_seq_length['chunk_step_lab'] = max_seq_length_list[3]
                max_seq_length['window_shift'] = max_seq_length_list[4]
                max_seq_length['window_size'] = max_seq_length_list[5]
            return max_seq_length

        # NOTE: any to_do other than train/valid/forward leaves max_seq_length
        # unbound and fails at the return statement below.
        to_do=config['exp']['to_do']
        if to_do=='train':
            max_seq_length=_get_max_seq_length_from_config_str(config['batches']['max_seq_length_train'])
        if to_do=='valid':
            max_seq_length=_get_max_seq_length_from_config_str(config['batches']['max_seq_length_valid'])
        if to_do=='forward':
            max_seq_length=-1 # do not break forward sentences
            fea_only=True
        fea_dict, lab_dict, arch_dict = dict_fea_lab_arch(config, fea_only)
        seq_model = is_sequential_dict(config, arch_dict)
        return to_do, max_seq_length, fea_dict, lab_dict, arch_dict, seq_model
    def _read_features_and_labels(fea_dict, lab_dict, max_seq_length, fea_only, output_folder):
        def _get_fea_config_from_dict(fea_dict_entr):
            # entries 1..4 of a feature description: scp, options, left and right context
            fea_scp = fea_dict_entr[1]
            fea_opts = fea_dict_entr[2]
            cw_left = int(fea_dict_entr[3])
            cw_right = int(fea_dict_entr[4])
            return fea_scp, fea_opts, cw_left, cw_right
        def _get_lab_config_from_dict(lab_dict_entr, fea_only):
            if fea_only:
                lab_folder = None
                lab_opts = None
            else:
                lab_folder = lab_dict_entr[1]
                lab_opts = lab_dict_entr[2]
            return lab_folder, lab_opts
        def _compensate_for_different_context_windows(data_set_fea, data_set_lab, cw_left_max, cw_left, cw_right_max, cw_right, data_end_index_fea, data_end_index_lab):
            # Trim the rows that streams with a larger context window have lost,
            # so that all streams end up with the same number of rows.
            data_set_lab = np.take(data_set_lab, range(cw_left_max-cw_left,data_set_lab.shape[0]-(cw_right_max-cw_right)), axis=0, mode='clip')
            data_set_fea = np.take(data_set_fea, range(cw_left_max-cw_left,data_set_fea.shape[0]-(cw_right_max-cw_right)), axis=0, mode='clip')
            data_end_index_fea = data_end_index_fea - (cw_left_max - cw_left)
            data_end_index_lab = data_end_index_lab - (cw_left_max - cw_left)
            data_end_index_fea[-1] = data_end_index_fea[-1] - (cw_right_max - cw_right)
            data_end_index_lab[-1] = data_end_index_lab[-1] - (cw_right_max - cw_right)
            return data_set_lab, data_set_fea, data_end_index_fea, data_end_index_lab
        def _update_data(data_set, labs, fea_dict, fea, fea_index, data_set_fea, labs_fea, cnt_fea, cnt_lab):
            # Features are stacked once per feature stream (first label only),
            # labels once per label stream (first feature only). For each new
            # feature stream its start column, end column and width are
            # appended to fea_dict[fea]; the width is read back as entry 6
            # minus entry 5 (assumes the entry had 5 items before - TODO confirm).
            if cnt_fea==0 and cnt_lab==0:
                data_set=data_set_fea
                labs=labs_fea
                fea_dict[fea].append(fea_index)
                fea_index=fea_index+data_set_fea.shape[1]
                fea_dict[fea].append(fea_index)
                fea_dict[fea].append(fea_dict[fea][6]-fea_dict[fea][5])
            elif cnt_fea==0 and (not cnt_lab==0):
                labs=np.column_stack((labs,labs_fea))
            elif (not cnt_fea==0) and cnt_lab==0:
                data_set=np.column_stack((data_set,data_set_fea))
                fea_dict[fea].append(fea_index)
                fea_index=fea_index+data_set_fea.shape[1]
                fea_dict[fea].append(fea_index)
                fea_dict[fea].append(fea_dict[fea][6]-fea_dict[fea][5])
            return data_set, labs, fea_dict, fea_index
        def _check_consistency(data_name, data_name_fea, data_end_index_fea_ini, data_end_index_fea, data_end_index_lab_ini, data_end_index_lab):
            if not (data_name == data_name_fea):
                sys.stderr.write('ERROR: different sentence ids are detected for the different features. Plase check again input feature lists"\n')
                sys.exit(0)
            if not (data_end_index_fea_ini == data_end_index_fea).all():
                sys.stderr.write('ERROR end_index must be the same for all the sentences"\n')
                sys.exit(0)
            if not (data_end_index_lab_ini == data_end_index_lab).all():
                sys.stderr.write('ERROR end_index must be the same for all the sentences"\n')
                sys.exit(0)
        def _update_lab_dict(lab_dict, data_set):
            # append to each label entry the feature width plus the label's position
            cnt_lab=0
            for lab in lab_dict.keys():
                lab_dict[lab].append(data_set.shape[1]+cnt_lab)
                cnt_lab=cnt_lab+1
            return lab_dict
        def _load_chunk_refac01(fea_scp,fea_opts,lab_folder,lab_opts,left,right,max_sequence_length, output_folder,fea_only=False):
            [data_name,data_set,data_lab,end_index_fea,end_index_lab]=load_dataset(fea_scp,fea_opts,lab_folder,lab_opts,left,right, max_sequence_length, output_folder, fea_only)
            # TODO: this function will currently only work well if no context window is given or fea and lab have the same time dimensionality
            # Context window
            if left!=0 or right!=0:
                data_set=context_window(data_set,left,right)
            end_index_fea = end_index_fea - left
            end_index_lab = end_index_lab - left
            end_index_fea[-1] = end_index_fea[-1] - right
            end_index_lab[-1] = end_index_lab[-1] - right
            # mean and variance normalization
            data_set=(data_set-np.mean(data_set,axis=0))/np.std(data_set,axis=0)
            # Label processing
            data_lab=data_lab-data_lab.min()
            if right>0:
                data_lab=data_lab[left:-right]
            else:
                data_lab=data_lab[left:]
            # one-dimensional input gets a trailing feature axis
            if len(data_set.shape) == 1:
                data_set = np.expand_dims(data_set, -1)
            return [data_name, data_set, data_lab, end_index_fea, end_index_lab]

        cw_left_max, cw_right_max = compute_cw_max(fea_dict)
        fea_index=0
        cnt_fea=0
        data_name = None
        data_end_index_fea_ini = None
        data_end_index_lab_ini = None
        data_set = None
        labs = None
        for fea in fea_dict.keys():
            fea_scp, fea_opts, cw_left, cw_right = _get_fea_config_from_dict(fea_dict[fea])
            cnt_lab=0
            if fea_only:
                # placeholder entry so that the label loop runs when there are no labels
                lab_dict.update({'lab_name':'none'})
            for lab in lab_dict.keys():
                lab_folder, lab_opts = _get_lab_config_from_dict(lab_dict[lab], fea_only)
                data_name_fea, data_set_fea, data_set_lab, data_end_index_fea, data_end_index_lab = _load_chunk_refac01(fea_scp, fea_opts, lab_folder, lab_opts, cw_left, cw_right, max_seq_length, output_folder, fea_only)
                if sum([abs(e) for e in [cw_left_max, cw_right_max, cw_left, cw_right]]) != 0:
                    data_set_lab, data_set_fea, data_end_index_fea, data_end_index_lab = _compensate_for_different_context_windows(data_set_fea, data_set_lab, cw_left_max, cw_left, cw_right_max, cw_right, data_end_index_fea, data_end_index_lab)
                if cnt_fea == 0 and cnt_lab == 0:
                    # the first stream is the reference for the consistency checks
                    data_end_index_fea_ini = data_end_index_fea
                    data_end_index_lab_ini = data_end_index_lab
                    data_name = data_name_fea
                data_set, labs, fea_dict, fea_index = _update_data(data_set, labs, fea_dict, fea, fea_index, data_set_fea, data_set_lab, cnt_fea, cnt_lab)
                _check_consistency(data_name, data_name_fea, data_end_index_fea_ini, data_end_index_fea, data_end_index_lab_ini, data_end_index_lab)
                cnt_lab=cnt_lab+1
            cnt_fea=cnt_fea+1
        if not fea_only:
            lab_dict = _update_lab_dict(lab_dict, data_set)
        return data_name, data_end_index_fea_ini, data_end_index_lab_ini, fea_dict, lab_dict, data_set, labs
    def _reorder_data_set(data_set, labs, seq_model, to_do):
        # Randomise the row order for non-sequential models (never in forward
        # mode, and only when features and labels have the same number of rows).
        if not(seq_model) and to_do != 'forward' and (data_set.shape[0] == labs.shape[0]):
            data_set_shape = data_set.shape[1]
            data_set_joint = np.column_stack((data_set,labs))
            # Shuffle the joint matrix so that features and labels are permuted
            # together; shuffling data_set here would have no effect on the
            # result because data_set_joint is already a copy.
            np.random.shuffle(data_set_joint)
            data_set = data_set_joint[:, :data_set_shape]
            labs = np.squeeze(data_set_joint[:, data_set_shape:], axis=-1)
        return data_set, labs
    def _append_to_shared_list(shared_list, data_name, data_end_index_fea, data_end_index_lab, fea_dict, lab_dict, arch_dict, data_set):
        shared_list.append(data_name)
        shared_list.append(data_end_index_fea)
        shared_list.append(data_end_index_lab)
        shared_list.append(fea_dict)
        shared_list.append(lab_dict)
        shared_list.append(arch_dict)
        shared_list.append(data_set)
        return shared_list

    config = _read_chunk_specific_config(cfg_file)
    to_do, max_seq_length, fea_dict, lab_dict, arch_dict, seq_model = _read_from_config(config, fea_only)
    data_name, data_end_index_fea, data_end_index_lab, fea_dict, lab_dict, data_set, labs = _read_features_and_labels(fea_dict, lab_dict, max_seq_length, fea_only, output_folder)
    data_set, labs = _reorder_data_set(data_set, labs, seq_model, to_do)
    data_set = {'input': data_set, 'ref': labs}
    shared_list = _append_to_shared_list(shared_list, data_name, data_end_index_fea, data_end_index_lab, fea_dict, lab_dict, arch_dict, data_set)

def read_lab_fea(cfg_file,fea_only,shared_list,output_folder):
    """Load all features and labels of a chunk into one matrix.

    Reads the chunk-specific config file, loads every feature stream with
    load_chunk(), stacks the streams column-wise, appends the labels as the
    last columns and shuffles the rows when the model is not sequential.

    Nothing is returned; data_name, data_end_index, fea_dict, lab_dict,
    arch_dict and data_set are appended to shared_list in that order.
    Exits the process (via sys.exit) when the config file is missing or when
    the feature streams disagree on sentence ids or end indexes.
    """
    # Reading chunk-specific cfg file (first argument-mandatory file)
    if not os.path.exists(cfg_file):
        sys.stderr.write('ERROR: The config file %s does not exist!\n'%(cfg_file))
        sys.exit(0)
    config = configparser.ConfigParser()
    config.read(cfg_file)

    # Reading some cfg parameters
    to_do = config['exp']['to_do']
    if to_do == 'train':
        max_seq_length = int(config['batches']['max_seq_length_train'])
    elif to_do == 'valid':
        max_seq_length = int(config['batches']['max_seq_length_valid'])
    elif to_do == 'forward':
        max_seq_length = -1 # do not break forward sentences

    fea_dict, lab_dict, arch_dict = dict_fea_lab_arch(config, fea_only)
    cw_left_max, cw_right_max = compute_cw_max(fea_dict)

    fea_index = 0
    for cnt_fea, fea in enumerate(fea_dict):
        # description of the current feature stream
        fea_entry = fea_dict[fea]
        fea_scp = fea_entry[1]
        fea_opts = fea_entry[2]
        cw_left = int(fea_entry[3])
        cw_right = int(fea_entry[4])

        # Production case, we don't have labels (lab_name = none)
        if fea_only:
            lab_dict.update({'lab_name':'none'})

        for cnt_lab, lab in enumerate(lab_dict):
            if fea_only:
                lab_folder, lab_opts = None, None
            else:
                lab_folder = lab_dict[lab][1]
                lab_opts = lab_dict[lab][2]

            data_name_fea, data_set_fea, data_end_index_fea = load_chunk(
                fea_scp, fea_opts, lab_folder, lab_opts, cw_left, cw_right,
                max_seq_length, output_folder, fea_only)

            # making the same dimension for all the features
            # (compensating for different context windows)
            trim_left = cw_left_max - cw_left
            trim_right = cw_right_max - cw_right
            row_stop = data_set_fea.shape[0] - trim_right
            labs_fea = data_set_fea[trim_left:row_stop, -1]
            data_set_fea = data_set_fea[trim_left:row_stop, 0:-1]
            data_end_index_fea = data_end_index_fea - trim_left
            data_end_index_fea[-1] -= trim_right

            is_first_stream = (cnt_fea == 0 and cnt_lab == 0)
            if is_first_stream:
                # the first stream is the reference for the later checks
                data_set = data_set_fea
                labs = labs_fea
                data_end_index = data_end_index_fea
                data_name = data_name_fea
            else:
                if cnt_fea == 0:
                    labs = np.column_stack((labs, labs_fea))
                if cnt_lab == 0:
                    data_set = np.column_stack((data_set, data_set_fea))

            # once per feature stream: record start column, end column and width
            if cnt_lab == 0:
                fea_dict[fea].append(fea_index)
                fea_index = fea_index + data_set_fea.shape[1]
                fea_dict[fea].append(fea_index)
                fea_dict[fea].append(fea_dict[fea][6] - fea_dict[fea][5])

            if not is_first_stream:
                # Checks if lab_names are the same for all the features
                if not (data_name == data_name_fea):
                    sys.stderr.write('ERROR: different sentence ids are detected for the different features. Plase check again input feature lists"\n')
                    sys.exit(0)

                # Checks if end indexes are the same for all the features
                if not (data_end_index == data_end_index_fea).all():
                    sys.stderr.write('ERROR end_index must be the same for all the sentences"\n')
                    sys.exit(0)

    # each label entry gets the feature width plus the label's position
    if not fea_only:
        for lab_pos, lab in enumerate(lab_dict):
            lab_dict[lab].append(data_set.shape[1] + lab_pos)

    data_set = np.column_stack((data_set, labs))

    # check automatically if the model is sequential
    seq_model = is_sequential_dict(config, arch_dict)

    # Randomize if the model is not sequential
    if to_do != 'forward' and not seq_model:
        np.random.shuffle(data_set)

    # Adding all the elements in the shared list
    for item in (data_name, data_end_index, fea_dict, lab_dict, arch_dict, data_set):
        shared_list.append(item)
    

# The following libraries are copied from kaldi-io-for-python project (https://github.com/vesis84/kaldi-io-for-python)
    
# Copyright 2014-2016  Brno University of Technology (author: Karel Vesely)
# Licensed under the Apache License, Version 2.0 (the "License")
    
#################################################
# Define all custom exceptions,
class UnsupportedDataType(Exception):
  """Error for a data type the Kaldi I/O helpers do not support
  (presumably raised by the readers/writers further down the file)."""
class UnknownVectorHeader(Exception):
  """Error for an unrecognised vector header
  (presumably raised by the vector readers further down the file)."""
class UnknownMatrixHeader(Exception):
  """Error for an unrecognised matrix header
  (presumably raised by the matrix readers further down the file)."""

class BadSampleSize(Exception):
  """Error for an unexpected sample size
  (presumably raised by the readers further down the file)."""
class BadInputFormat(Exception):
  """Error for malformed input
  (presumably raised by the readers further down the file)."""

class SubprocessFailed(Exception):
  """Raised by the clean-up helper of popen() when the spawned command
  finishes with a return code greater than zero."""

#################################################
# Data-type independent helper functions,

def open_or_fd(file, output_folder,mode='rb'):
  """ fd = open_or_fd(file, output_folder, mode='rb')
   Open file, gzipped file, pipe, or forward the file-descriptor.
   Seeks to the offset if the 'file' argument contains an ':offset' suffix.

   Arguments:
   file : filename (optionally with an 'ark:'/'scp:' style prefix and a
     ':offset' suffix), a command ending with '|' (input pipe) or starting
     with '|' (output pipe), or an already opened descriptor,
   output_folder : forwarded to popen() for pipes,
   mode : mode for plain and gzipped files; pipes are always opened in
     binary mode ('rb' for input, 'wb' for output).

   Anything that is not a string is returned unchanged.
  """
  # An already opened descriptor is forwarded as is. Testing the type up
  # front (instead of catching TypeError around the whole body) makes sure a
  # TypeError raised while opening a real path or pipe is not mistaken for
  # "this was a descriptor".
  if not isinstance(file, str):
    return file

  offset = None
  # strip 'ark:' prefix from r{x,w}filename (optional),
  if re.search('^(ark|scp)(,scp|,b|,t|,n?f|,n?p|,b?o|,n?s|,n?cs)*:', file):
    (prefix,file) = file.split(':',1)
  # separate offset from filename (optional),
  if re.search(':[0-9]+$', file):
    (file,offset) = file.rsplit(':',1)

  if file.endswith('|'):
    # input pipe
    fd = popen(file[:-1], output_folder,'rb') # custom,
  elif file.startswith('|'):
    # output pipe
    fd = popen(file[1:], output_folder,'wb') # custom,
  elif file.split('.')[-1] == 'gz':
    # gzipped file
    fd = gzip.open(file, mode)
  else:
    # a normal file
    fd = open(file, mode)

  # seek to the offset, if one was given
  if offset is not None: fd.seek(int(offset))

  return fd

# based on '/usr/local/lib/python3.4/os.py'
def popen(cmd, output_folder,mode="rb"):
  """ stream = popen(cmd, output_folder, mode="rb")
   Run 'cmd' through the shell and return one end of a pipe connected to it.

   Arguments:
   cmd : command line (string), executed with shell=True,
   output_folder : the stderr of the command is appended to
     output_folder+'/log.log',
   mode : 'r'/'rb' return the command's stdout (text/binary),
     'w'/'wb' return the command's stdin (text/binary).

   Raises TypeError if cmd is not a string and ValueError for any other mode.
   A helper thread waits for the command; when it exits with a return code
   greater than zero SubprocessFailed is raised inside that thread, so it is
   not delivered to the caller of popen().

   NOTE: cmd is interpreted by the shell, it must not be built from
   untrusted input.
  """
  if not isinstance(cmd, str):
    raise TypeError("invalid cmd type (%s, expected string)" % type(cmd))

  import subprocess, io, threading

  # sanity, checked before anything is opened or started,
  if mode not in ("r", "w", "rb", "wb"):
    raise ValueError("invalid mode %s" % mode)

  # cleanup function for subprocesses,
  def cleanup(proc, cmd):
    ret = proc.wait()
    if ret > 0:
      raise SubprocessFailed('cmd %s returned %d !' % (cmd,ret))
    return

  reading = mode in ("r", "rb")
  text = mode in ("r", "w")

  # The child process receives its own copy of the log descriptor, so the
  # parent's handle can be closed as soon as the process has been started.
  with open(output_folder+'/log.log',"a") as err:
    if reading:
      proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,stderr=err)
    else:
      proc = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE,stderr=err)
  threading.Thread(target=cleanup,args=(proc,cmd)).start() # clean-up thread,

  stream = proc.stdout if reading else proc.stdin
  if text:
    return io.TextIOWrapper(stream)
  return stream


def read_key(fd):
  """ [key] = read_key(fd)
   Read the utterance-key from the opened ark/stream descriptor 'fd'.

   Bytes are consumed one at a time up to and including the first space (or
   up to the end of the stream) and decoded as latin1. Surrounding
   whitespace is stripped from the key.
   Returns the key, or None when nothing but whitespace was left to read.
  """
  chars = []
  while True:
    char = fd.read(1).decode("latin1")
    if char == '' or char == ' ': break # end of stream or end of key,
    chars.append(char)
  key = ''.join(chars).strip()
  if key == '': return None # end of file,
  # check format (no whitespace!); raw string so that '\S' is a regex class
  # and not an invalid string escape,
  assert(re.match(r'^\S+$',key) is not None)
  return key


#################################################
# Integer vectors (alignments, ...),

def read_ali_ark(file_or_fd,output_folder):
  """ generator(key,vec) = read_ali_ark(file_or_fd, output_folder)
   Convenience name for read_vec_int_ark(): both arguments are passed on
   unchanged and its generator is returned.
  """
  alignments = read_vec_int_ark(file_or_fd,output_folder)
  return alignments

def read_vec_int_ark(file_or_fd,output_folder):
  """ generator(key,vec) = read_vec_int_ark(file_or_fd, output_folder)
   Generator of (key,vector<int>) tuples read from an ark file/stream.
   file_or_fd : ark, gzipped ark, pipe or opened file descriptor,
   output_folder : forwarded to open_or_fd() and read_vec_int().

   Iteration stops at the first empty key. A descriptor opened here is
   closed when the generator finishes; a descriptor passed in by the
   caller is left open.

   Read ark to a 'dictionary':
   d = { u:d for u,d in read_vec_int_ark(file, output_folder) }
  """
  stream = open_or_fd(file_or_fd,output_folder)
  try:
    while True:
      utt_id = read_key(stream)
      if not utt_id:
        break
      yield utt_id, read_vec_int(stream,output_folder)
  finally:
    if stream is not file_or_fd:
      stream.close()

def read_vec_int(file_or_fd,output_folder):
  """ [int-vec] = read_vec_int(file_or_fd, output_folder)
   Read kaldi integer vector, ascii or binary input.
   file_or_fd : filename or opened file descriptor,
   output_folder : forwarded to open_or_fd().

   Returns a numpy array. A descriptor opened here is always closed before
   returning (also for empty vectors and on errors); a descriptor passed in
   by the caller is left open.
  """
  fd = open_or_fd(file_or_fd,output_folder)
  try:
    binary = fd.read(2).decode()
    if binary == '\0B': # binary flag
      assert(fd.read(1).decode() == '\4') # int-size
      vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim
      if vec_size == 0:
        return np.array([], dtype='int32')
      # Elements from int32 vector are stored in tuples: (sizeof(int32), value),
      vec = np.frombuffer(fd.read(vec_size*5), dtype=[('size','int8'),('value','int32')], count=vec_size)
      assert(vec[0]['size'] == 4) # int32 size,
      ans = vec[:]['value'] # values are in 2nd column,
    else: # ascii,
      # the two bytes already consumed belong to the text line
      arr = (binary + fd.readline().decode()).strip().split()
      try:
        arr.remove('['); arr.remove(']') # optionally
      except ValueError:
        pass
      ans = np.array(arr, dtype=int)
    return ans
  finally:
    if fd is not file_or_fd : fd.close() # cleanup

# Writing,
def write_vec_int(file_or_fd, output_folder, v, key=''):
  """ write_vec_int(f, output_folder, v, key='')
   Write a binary kaldi integer vector to filename or stream.
   Arguments:
   file_or_fd : filename or opened file descriptor for writing (binary mode),
   output_folder : forwarded untouched to 'open_or_fd()',
   v : the vector to be stored (needs '.shape', 'len()' and integer indexing),
   key (optional) : used for writing ark-file, the utterance-id gets written before the vector.

   Layout written: [key + ' '] NUL 'B', 0x04, int32 dim, then one
   (0x04, int32 value) pair per element.

   Example of writing single vector:
   kaldi_io.write_vec_int(filename, output_folder, vec)

   Example of writing arkfile:
   with open(ark_file,'wb') as f:
     for key,vec in dict.items():
       kaldi_io.write_vec_int(f, output_folder, vec, key=key)
  """
  int32_code = np.dtype('int32').char
  out = open_or_fd(file_or_fd, output_folder, mode='wb')
  if sys.version_info[0] == 3: assert(out.mode == 'wb')
  try:
    if key != '':
      # ark-files have keys (utterance-id),
      out.write((key+' ').encode("latin1"))
    # binary marker,
    out.write('\0B'.encode())
    # dimension, prefixed by the size of its integer type,
    out.write('\4'.encode())
    out.write(struct.pack(int32_code, v.shape[0]))
    # payload: every element carries its own size prefix,
    for pos in range(len(v)):
      out.write('\4'.encode())
      out.write(struct.pack(int32_code, v[pos]))
  finally:
    if out is not file_or_fd:
      out.close()


#################################################
# Float vectors (confidences, ivectors, ...),

# Reading,
def read_vec_flt_scp(file_or_fd,output_folder):
  """ generator(key,vec) = read_vec_flt_scp(file_or_fd, output_folder)
   Lazily yields (key, vector) tuples, read according to a kaldi scp.
   file_or_fd : scp, gzipped scp, pipe or opened file descriptor.
   output_folder : forwarded untouched to 'open_or_fd()' and 'read_vec_flt()'.

   Every scp line is decoded and split on single spaces; it must produce
   exactly two fields (key, rxfile), otherwise the unpacking raises ValueError.

   Iterate the scp:
   for key,vec in kaldi_io.read_vec_flt_scp(file, output_folder):
     ...

   Read scp to a 'dictionary':
   d = { key:vec for key,vec in kaldi_io.read_vec_flt_scp(file, output_folder) }
  """
  scp = open_or_fd(file_or_fd,output_folder)
  try:
    for entry in scp:
      fields = entry.decode().split(' ')
      (utt_id, rxfile) = fields
      yield utt_id, read_vec_flt(rxfile,output_folder)
  finally:
    if scp is not file_or_fd:
      scp.close()

def read_vec_flt_ark(file_or_fd,output_folder):
  """ generator(key,vec) = read_vec_flt_ark(file_or_fd, output_folder)
   Lazily yields (key, vector<float>) tuples read from an ark file/stream.
   file_or_fd : ark, gzipped ark, pipe or opened file descriptor.
   output_folder : forwarded untouched to 'open_or_fd()' and 'read_vec_flt()'.

   Iteration stops as soon as 'read_key()' returns a falsy key (end of input).

   Read ark to a 'dictionary':
   d = { u:d for u,d in kaldi_io.read_vec_flt_ark(file, output_folder) }
  """
  stream = open_or_fd(file_or_fd,output_folder)
  try:
    while True:
      utt_id = read_key(stream)
      if not utt_id:
        break
      flt_vec = read_vec_flt(stream,output_folder)
      yield utt_id, flt_vec
  finally:
    # Only close what we opened; a caller-supplied descriptor stays open.
    if stream is not file_or_fd:
      stream.close()

def read_vec_flt(file_or_fd,output_folder):
  """ [flt-vec] = read_vec_flt(file_or_fd, output_folder)
   Read a single kaldi float vector; the format is chosen from the first
   two bytes of the input:
     NUL 'B' : kaldi binary vector  -> '_read_vec_flt_binary()',
     'RI'    : RIFF/WAV audio       -> '_read_vec_flt_riff()',
     other   : one ascii line, optionally wrapped in '[' ... ']'.

   file_or_fd : filename or opened file descriptor positioned at the vector.
   output_folder : forwarded untouched to 'open_or_fd()'.

   The descriptor is closed on every exit path (including the binary and
   RIFF branches and on errors), but only when it was opened by this function.
  """
  fd = open_or_fd(file_or_fd,output_folder)
  try:
    binary = fd.read(2).decode()
    if binary == '\0B': # binary flag
      return _read_vec_flt_binary(fd)
    if binary == 'RI':
      return _read_vec_flt_riff(fd)
    # ascii: the two bytes already consumed belong to the first line,
    arr = (binary + fd.readline().decode()).strip().split()
    try:
      arr.remove('['); arr.remove(']') # optionally
    except ValueError:
      pass
    return np.array(arr, dtype=float)
  finally:
    if fd is not file_or_fd : fd.close() # cleanup

def _read_vec_flt_riff(fd):
    RIFF_CHUNK_DESCR_HEADER_SIZE = 12
    ALREADY_READ_HEADER_BYTES = 2
    SUB_CHUNK_HEADER_SIZE = 8
    DATA_CHUNK_HEADER_SIZE = 8
    def pcm2float(signal, dtype='float32'):
        signal = np.asarray(signal)
        dtype = np.dtype(dtype)
        return signal.astype(dtype) / dtype.type(-np.iinfo(signal.dtype).min)

    import struct
    header = fd.read(RIFF_CHUNK_DESCR_HEADER_SIZE - ALREADY_READ_HEADER_BYTES)
    assert header[:2] == b'FF'
    chunk_header = fd.read(SUB_CHUNK_HEADER_SIZE)
    subchunk_id, subchunk_size = struct.unpack('<4sI', chunk_header)
    aformat, channels, samplerate, byterate, block_align, bps = struct.unpack('HHIIHH', fd.read(subchunk_size))
    subchunk2_id, subchunk2_size = struct.unpack('<4sI', fd.read(DATA_CHUNK_HEADER_SIZE))
    pcm_data = np.frombuffer(fd.read(subchunk2_size), dtype='int' + str(bps))
    return pcm2float(pcm_data)

def _read_vec_flt_binary(fd):
  header = fd.read(3).decode()
  if header == 'FV ' : sample_size = 4 # floats
  elif header == 'DV ' : sample_size = 8 # doubles
  else : raise UnknownVectorHeader("The header contained '%s'" % header)
  assert (sample_size > 0)
  # Dimension,
  assert (fd.read(1).decode() == '\4'); # int-size
  vec_size = np.frombuffer(fd.read(4), dtype='int32', count=1)[0] # vector dim
  if vec_size == 0:
    return np.array([], dtype='float32')
  # Read whole vector,
  buf = fd.read(vec_size * sample_size)
  if sample_size == 4 : ans = np.frombuffer(buf, dtype='float32')
  elif sample_size == 8 : ans = np.frombuffer(buf, dtype='float64')
  else : raise BadSampleSize
  return ans

# Writing,
def write_vec_flt(file_or_fd, output_folder, v, key=''):
  """ write_vec_flt(f, output_folder, v, key='')
   Write a binary kaldi vector to filename or stream. Supports 32bit and 64bit floats.
   Arguments:
   file_or_fd : filename or opened file descriptor for writing (binary mode),
   output_folder : forwarded untouched to 'open_or_fd()',
   v : the vector to be stored (numpy array of dtype float32 or float64),
   key (optional) : used for writing ark-file, the utterance-id gets written before the vector.

   Raises UnsupportedDataType for any other dtype; note that the key and the
   binary marker have already been written at that point.

   Example of writing single vector:
   kaldi_io.write_vec_flt(filename, output_folder, vec)

   Example of writing arkfile:
   with open(ark_file,'wb') as f:
     for key,vec in dict.items():
       kaldi_io.write_vec_flt(f, output_folder, vec, key=key)
  """
  out = open_or_fd(file_or_fd,output_folder, mode='wb')
  if sys.version_info[0] == 3: assert(out.mode == 'wb')
  try:
    if key != '':
      # ark-files have keys (utterance-id),
      out.write((key+' ').encode("latin1"))
    # binary marker,
    out.write('\0B'.encode())
    # type tag selected from the array dtype,
    if v.dtype == 'float32':
      type_tag = 'FV '
    elif v.dtype == 'float64':
      type_tag = 'DV '
    else:
      raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % v.dtype)
    out.write(type_tag.encode())
    # dimension, prefixed by the size of its integer type,
    out.write('\04'.encode())
    out.write(struct.pack(np.dtype('uint32').char, v.shape[0]))
    # raw samples,
    out.write(v.tobytes())
  finally:
    if out is not file_or_fd:
      out.close()


#################################################
# Float matrices (features, transformations, ...),

# Reading,
def read_mat_scp(file_or_fd,output_folder):
  """ generator(key,mat) = read_mat_scp(file_or_fd, output_folder)
   Lazily yields (key, matrix) tuples, read according to a kaldi scp.
   file_or_fd : scp, gzipped scp, pipe or opened file descriptor.
   output_folder : forwarded untouched to 'open_or_fd()' and 'read_mat()'.

   Every scp line is decoded and split on single spaces; it must produce
   exactly two fields (key, rxfile), otherwise the unpacking raises ValueError.

   Iterate the scp:
   for key,mat in kaldi_io.read_mat_scp(file, output_folder):
     ...

   Read scp to a 'dictionary':
   d = { key:mat for key,mat in kaldi_io.read_mat_scp(file, output_folder) }
  """
  scp = open_or_fd(file_or_fd,output_folder)
  try:
    for entry in scp:
      fields = entry.decode().split(' ')
      (utt_id, rxfile) = fields
      yield utt_id, read_mat(rxfile,output_folder)
  finally:
    if scp is not file_or_fd:
      scp.close()

def read_mat_ark(file_or_fd,output_folder):
  """ generator(key,mat) = read_mat_ark(file_or_fd, output_folder)
   Lazily yields (key, matrix) tuples read from an ark file/stream.
   file_or_fd : ark, gzipped ark, pipe or opened file descriptor.
   output_folder : forwarded untouched to 'open_or_fd()' and 'read_mat()'.

   Iteration stops as soon as 'read_key()' returns a falsy key (end of input).

   Iterate the ark:
   for key,mat in kaldi_io.read_mat_ark(file, output_folder):
     ...

   Read ark to a 'dictionary':
   d = { key:mat for key,mat in kaldi_io.read_mat_ark(file, output_folder) }
  """
  stream = open_or_fd(file_or_fd,output_folder)
  try:
    while True:
      utt_id = read_key(stream)
      if not utt_id:
        break
      matrix = read_mat(stream,output_folder)
      yield utt_id, matrix
  finally:
    # Only close what we opened; a caller-supplied descriptor stays open.
    if stream is not file_or_fd:
      stream.close()
  

def read_mat(file_or_fd,output_folder):
  """ [mat] = read_mat(file_or_fd, output_folder)
   Reads single kaldi matrix, supports ascii and binary.
   file_or_fd : file, gzipped file, pipe or opened file descriptor.
   output_folder : forwarded untouched to 'open_or_fd()'.

   The first two bytes select the parser: NUL 'B' means binary, anything
   else must be ' [' (start of an ascii matrix) or an AssertionError is raised.
   The descriptor is closed afterwards only when it was opened here.
  """
  stream = open_or_fd(file_or_fd,output_folder)
  try:
    flag = stream.read(2).decode()
    if flag != '\0B':
      assert(flag == ' [')
      return _read_mat_ascii(stream)
    return _read_mat_binary(stream)
  finally:
    if stream is not file_or_fd:
      stream.close()

def _read_mat_binary(fd):
  # Data type
  header = fd.read(3).decode()
  # 'CM', 'CM2', 'CM3' are possible values,
  if header.startswith('CM'): return _read_compressed_mat(fd, header)
  elif header == 'FM ': sample_size = 4 # floats
  elif header == 'DM ': sample_size = 8 # doubles
  else: raise UnknownMatrixHeader("The header contained '%s'" % header)
  assert(sample_size > 0)
  # Dimensions
  s1, rows, s2, cols = np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0]
  # Read whole matrix
  buf = fd.read(rows * cols * sample_size)
  if sample_size == 4 : vec = np.frombuffer(buf, dtype='float32')
  elif sample_size == 8 : vec = np.frombuffer(buf, dtype='float64')
  else : raise BadSampleSize
  mat = np.reshape(vec,(rows,cols))
  return mat

def _read_mat_ascii(fd):
  rows = []
  while 1:
    line = fd.readline().decode()
    if (len(line) == 0) : raise BadInputFormat # eof, should not happen!
    if len(line.strip()) == 0 : continue # skip empty line
    arr = line.strip().split()
    if arr[-1] != ']':
      rows.append(np.array(arr,dtype='float32')) # not last line
    else:
      rows.append(np.array(arr[:-1],dtype='float32')) # last line
      mat = np.vstack(rows)
      return mat


def _read_compressed_mat(fd, format):
  """ Read a compressed matrix,
      see: https://github.com/kaldi-asr/kaldi/blob/master/src/matrix/compressed-matrix.h
      methods: CompressedMatrix::Read(...), CompressedMatrix::CopyToMat(...),
  """
  assert(format == 'CM ') # The formats CM2, CM3 are not supported...

  # Format of header 'struct',
  global_header = np.dtype([('minvalue','float32'),('range','float32'),('num_rows','int32'),('num_cols','int32')]) # member '.format' is not written,
  per_col_header = np.dtype([('percentile_0','uint16'),('percentile_25','uint16'),('percentile_75','uint16'),('percentile_100','uint16')])

  # Read global header,
  globmin, globrange, rows, cols = np.frombuffer(fd.read(16), dtype=global_header, count=1)[0]

  # The data is structed as [Colheader, ... , Colheader, Data, Data , .... ]
  #                         {           cols           }{     size         }
  col_headers = np.frombuffer(fd.read(cols*8), dtype=per_col_header, count=cols)
  col_headers = np.array([np.array([x for x in y]) * globrange * 1.52590218966964e-05 + globmin for y in col_headers], dtype=np.float32)
  data = np.reshape(np.frombuffer(fd.read(cols*rows), dtype='uint8', count=cols*rows), newshape=(cols,rows)) # stored as col-major,

  mat = np.zeros((cols,rows), dtype='float32')
  p0 = col_headers[:, 0].reshape(-1, 1)
  p25 = col_headers[:, 1].reshape(-1, 1)
  p75 = col_headers[:, 2].reshape(-1, 1)
  p100 = col_headers[:, 3].reshape(-1, 1)
  mask_0_64 = (data <= 64)
  mask_193_255 = (data > 192)
  mask_65_192 = (~(mask_0_64 | mask_193_255))

  mat += (p0  + (p25 - p0) / 64. * data) * mask_0_64.astype(np.float32)
  mat += (p25 + (p75 - p25) / 128. * (data - 64)) * mask_65_192.astype(np.float32)
  mat += (p75 + (p100 - p75) / 63. * (data - 192)) * mask_193_255.astype(np.float32)

  return mat.T # transpose! col-major -> row-major,


# Writing,
def write_mat(output_folder,file_or_fd, m, key=''):
  """ write_mat(output_folder, f, m, key='')
  Write a binary kaldi matrix to filename or stream. Supports 32bit and 64bit floats.
  Note that, unlike the other writers in this file, 'output_folder' is the
  FIRST positional argument here.
  Arguments:
   output_folder : forwarded untouched to 'open_or_fd()',
   file_or_fd : filename or opened file descriptor for writing (binary mode),
   m : the matrix to be stored (2-D numpy array of dtype float32 or float64),
   key (optional) : used for writing ark-file, the utterance-id gets written before the matrix.

  Raises UnsupportedDataType for any other dtype; note that the key and the
  binary marker have already been written at that point.

   Example of writing single matrix:
   kaldi_io.write_mat(output_folder, filename, mat)

   Example of writing arkfile:
   with open(ark_file,'wb') as f:
     for key,mat in dict.items():
       kaldi_io.write_mat(output_folder, f, mat, key=key)
  """
  out = open_or_fd(file_or_fd, output_folder, mode='wb')
  if sys.version_info[0] == 3: assert(out.mode == 'wb')
  try:
    if key != '':
      # ark-files have keys (utterance-id),
      out.write((key+' ').encode("latin1"))
    # binary marker,
    out.write('\0B'.encode())
    # type tag selected from the array dtype,
    if m.dtype == 'float32':
      type_tag = 'FM '
    elif m.dtype == 'float64':
      type_tag = 'DM '
    else:
      raise UnsupportedDataType("'%s', please use 'float32' or 'float64'" % m.dtype)
    out.write(type_tag.encode())
    # both dimensions, each prefixed by the size of its integer type,
    uint32_code = np.dtype('uint32').char
    out.write('\04'.encode())
    out.write(struct.pack(uint32_code, m.shape[0])) # rows
    out.write('\04'.encode())
    out.write(struct.pack(uint32_code, m.shape[1])) # cols
    # raw samples,
    out.write(m.tobytes())
  finally:
    if out is not file_or_fd:
      out.close()


#################################################
# 'Posterior' kaldi type (posteriors, confusion network, nnet1 training targets, ...)
# Corresponds to: vector<vector<tuple<int,float> > >
# - outer vector: time axis
# - inner vector: records at the time
# - tuple: int = index, float = value
#

def read_cnet_ark(file_or_fd,output_folder):
  """ Alias of function 'read_post_ark()', 'cnet' = confusion network.

   Both arguments are forwarded unchanged and the generator produced by
   'read_post_ark()' is returned as-is.
  """
  return read_post_ark(file_or_fd,output_folder)

def read_post_rxspec(file_, output_folder=None):
  """ adaptor to read both 'ark:...' and 'scp:...' inputs of posteriors,

   file_ : read specifier, must start with 'ark:' or 'scp:'.
   output_folder (optional) : forwarded to 'read_post_ark()' / 'read_post_scp()',
     which both require it. NOTE(review): how 'open_or_fd()' treats the
     default None is not visible from here - pass a real folder when in doubt.

   Returns the generator created by the selected reader.
   For any other prefix an error is printed and the process exits with status 1.
  """
  if file_.startswith("ark:"):
    return read_post_ark(file_, output_folder)
  elif file_.startswith("scp:"):
    return read_post_scp(file_, output_folder)
  else:
    print("unsupported intput type: %s" % file_)
    print("it should begint with 'ark:' or 'scp:'")
    sys.exit(1)

def read_post_scp(file_or_fd,output_folder):
  """ generator(key,post) = read_post_scp(file_or_fd, output_folder)
   Returns generator of (key,post) tuples, read according to kaldi scp.
   file_or_fd : scp, gzipped scp, pipe or opened file descriptor.
   output_folder : forwarded untouched to 'open_or_fd()' and 'read_post()'.

   Every scp line is decoded and split on single spaces; it must produce
   exactly two fields (key, rxfile), otherwise the unpacking raises ValueError.

   Iterate the scp:
   for key,post in kaldi_io.read_post_scp(file, output_folder):
     ...

   Read scp to a 'dictionary':
   d = { key:post for key,post in kaldi_io.read_post_scp(file, output_folder) }
  """
  fd = open_or_fd(file_or_fd,output_folder)
  try:
    for line in fd:
      (key,rxfile) = line.decode().split(' ')
      # 'read_post()' takes (file_or_fd, output_folder); both are required.
      post = read_post(rxfile,output_folder)
      yield key, post
  finally:
    if fd is not file_or_fd : fd.close()

def read_post_ark(file_or_fd,output_folder):
  """ generator(key,vec<vec<int,float>>) = read_post_ark(file, output_folder)
   Returns generator of (key,posterior) tuples, read from ark file.
   file_or_fd : ark, gzipped ark, pipe or opened file descriptor.
   output_folder : forwarded untouched to 'open_or_fd()' and 'read_post()'.

   Iteration stops as soon as 'read_key()' returns a falsy key (end of input).

   Iterate the ark:
   for key,post in kaldi_io.read_post_ark(file, output_folder):
     ...

   Read ark to a 'dictionary':
   d = { key:post for key,post in kaldi_io.read_post_ark(file, output_folder) }
  """
  fd = open_or_fd(file_or_fd,output_folder)
  try:
    key = read_key(fd)
    while key:
      # 'read_post()' takes (file_or_fd, output_folder); both are required.
      post = read_post(fd,output_folder)
      yield key, post
      key = read_key(fd)
  finally:
    if fd is not file_or_fd: fd.close()

def read_post(file_or_fd,output_folder):
  """ [post] = read_post(file_or_fd, output_folder)
   Reads single kaldi 'Posterior' in binary format.

   The 'Posterior' is C++ type 'vector<vector<tuple<int,float> > >',
   the outer-vector is usually time axis, inner-vector are the records
   at given time,  and the tuple is composed of an 'index' (integer)
   and a 'float-value'. The 'float-value' can represent a probability
   or any other numeric value.

   file_or_fd : filename or opened file descriptor positioned at the posterior.
   output_folder : forwarded untouched to 'open_or_fd()'.

   Returns vector of vectors of tuples; a frame without records yields an
   empty inner list. Raises AssertionError when the input is not binary or
   a size marker is wrong.

   The descriptor is closed on every exit path, but only when it was opened
   by this function.
  """
  fd = open_or_fd(file_or_fd,output_folder)
  try:
    ans=[]
    binary = fd.read(2).decode(); assert(binary == '\0B'); # binary flag
    assert(fd.read(1).decode() == '\4'); # int-size
    outer_vec_size = int(np.frombuffer(fd.read(4), dtype='int32', count=1)[0]) # number of frames (or bins)

    # Loop over 'outer-vector',
    for _ in range(outer_vec_size):
      assert(fd.read(1).decode() == '\4'); # int-size
      inner_vec_size = int(np.frombuffer(fd.read(4), dtype='int32', count=1)[0]) # number of records for frame (or bin)
      if inner_vec_size == 0:
        # Nothing to read and no first record whose size markers could be checked.
        ans.append([])
        continue
      # Each record is (sizeof(int32), idx, sizeof(float), post) = 10 bytes,
      data = np.frombuffer(fd.read(inner_vec_size*10), dtype=[('size_idx','int8'),('idx','int32'),('size_post','int8'),('post','float32')], count=inner_vec_size)
      assert(data[0]['size_idx'] == 4)
      assert(data[0]['size_post'] == 4)
      ans.append(data[['idx','post']].tolist())
    return ans
  finally:
    if fd is not file_or_fd: fd.close()


#################################################
# Kaldi Confusion Network bin begin/end times,
# (kaldi stores CNs time info separately from the Posterior).
#

def read_cntime_ark(file_or_fd,output_folder):
  """ generator(key,vec<tuple<float,float>>) = read_cntime_ark(file_or_fd, output_folder)
   Returns generator of (key,cntime) tuples, read from ark file.
   file_or_fd : file, gzipped file, pipe or opened file descriptor.
   output_folder : forwarded untouched to 'open_or_fd()' and 'read_cntime()'.

   Iteration stops as soon as 'read_key()' returns a falsy key (end of input).

   Iterate the ark:
   for key,time in kaldi_io.read_cntime_ark(file, output_folder):
     ...

   Read ark to a 'dictionary':
   d = { key:time for key,time in kaldi_io.read_cntime_ark(file, output_folder) }
  """
  fd = open_or_fd(file_or_fd,output_folder)
  try:
    key = read_key(fd)
    while key:
      # 'read_cntime()' takes (file_or_fd, output_folder); both are required.
      cntime = read_cntime(fd,output_folder)
      yield key, cntime
      key = read_key(fd)
  finally:
    if fd is not file_or_fd : fd.close()

def read_cntime(file_or_fd,output_folder):
  """ [cntime] = read_cntime(file_or_fd, output_folder)
   Reads single kaldi 'Confusion Network time info', in binary format:
   C++ type: vector<tuple<float,float> >.
   (begin/end times of bins at the confusion network).

   Binary layout is '<num-bins> <beg1> <end1> <beg2> <end2> ...'

   file_or_fd : file, gzipped file, pipe or opened file descriptor.
   output_folder : forwarded untouched to 'open_or_fd()'.

   Returns vector of tuples (t_beg, t_end); an empty list when there are no
   bins. Raises AssertionError when the input is not binary or a size marker
   is wrong.

   The descriptor is closed on every exit path, but only when it was opened
   by this function.
  """
  fd = open_or_fd(file_or_fd,output_folder)
  try:
    binary = fd.read(2).decode(); assert(binary == '\0B'); # assuming it's binary

    assert(fd.read(1).decode() == '\4'); # int-size
    vec_size = int(np.frombuffer(fd.read(4), dtype='int32', count=1)[0]) # number of frames (or bins)
    if vec_size == 0:
      # Nothing to read and no first record whose size markers could be checked.
      return []

    # Each record is (sizeof(float), t_beg, sizeof(float), t_end) = 10 bytes,
    data = np.frombuffer(fd.read(vec_size*10), dtype=[('size_beg','int8'),('t_beg','float32'),('size_end','int8'),('t_end','float32')], count=vec_size)
    assert(data[0]['size_beg'] == 4)
    assert(data[0]['size_end'] == 4)
    return data[['t_beg','t_end']].tolist() # Return vector of tuples (t_beg,t_end),
  finally:
    if fd is not file_or_fd : fd.close()


#################################################
# Segments related,
#

# Segments as 'Bool vectors' can be handy,
# - for 'superposing' the segmentations,
# - for frame-selection in Speaker-ID experiments,
def read_segments_as_bool_vec(segments_file):
  """ [ bool_vec ] = read_segments_as_bool_vec(segments_file)
   Turn a kaldi 'segments' file for 1 wav, format : '<utt> <rec> <t-beg> <t-end>',
   into a per-frame boolean vector that is True inside the segments.
   - t-beg, t-end is in seconds,
   - assumed 100 frames/second,
   - the vector ends at the last frame of the last segment.

   The file must not be empty and all segments must refer to the same
   recording, otherwise an AssertionError is raised.
  """
  segs = np.loadtxt(segments_file, dtype='object,object,f,f', ndmin=1)
  # Sanity checks,
  assert(len(segs) > 0) # empty segmentation is an error,
  recordings = np.unique([rec[1] for rec in segs ])
  assert(len(recordings) == 1) # segments with only 1 wav-file,
  # Convert time to frame-indexes,
  start = np.rint([100 * rec[2] for rec in segs]).astype(int)
  end = np.rint([100 * rec[3] for rec in segs]).astype(int)
  # Run-length description of the output: a gap (False) followed by a segment
  # (True) for every entry, plus a zero-length False run closing the sequence.
  # Taken from 'read_lab_to_bool_vec', htk.py,
  previous_end = np.r_[0, end[:-1]]
  run_values = np.r_[np.tile([False,True], len(end)), False]
  run_lengths = np.r_[np.c_[start - previous_end, end-start].flat, 0]
  frms = np.repeat(run_values, run_lengths)
  assert np.sum(end-start) == np.sum(frms)
  return frms