23

 def tokenize_and_convert(file_path):

    token_dict = {}

    unique_id = 1


    with open(file_path, 'r') as file:

        lines = file.readlines()


    tokenized_lines = []

    for line in lines:

        tokens = line.split()

        converted_tokens = []

        for token in tokens:

            if token not in token_dict:

                token_dict[token] = unique_id

                unique_id += 1

            converted_tokens.append(token_dict[token])

        tokenized_lines.append(converted_tokens)


    max_length = max(len(line) for line in tokenized_lines)

    padded_lines = [line + [0] * (max_length - len(line)) for line in tokenized_lines]


    return padded_lines, token_dict


# Demo run: tokenize the sample file and print the resulting
# (padded ID rows, token-to-ID mapping) tuple.
file_path = 'example.txt'

print(
    tokenize_and_convert(file_path=file_path)
)


Comments