23
def tokenize_and_convert(file_path):
    """Convert a text file into rows of integer token ids, padded with 0.

    Each line is split on whitespace. Every distinct token receives an
    integer id in order of first appearance, starting at 1; the value 0 is
    used only as padding. Each line of the file becomes one row, and rows
    are right-padded with 0 up to the length of the longest row.

    Args:
        file_path: Path of the text file to read. The file is opened in
            text mode with the platform's default encoding.

    Returns:
        A ``(padded_lines, token_dict)`` tuple. ``padded_lines`` is a list
        with one equal-length list of ids per line of the file, and
        ``token_dict`` maps each distinct token to its id. An empty file
        yields ``([], {})``.

    Raises:
        OSError: If the file cannot be opened.
    """
    token_dict = {}
    tokenized_lines = []

    with open(file_path, 'r') as file:
        # Iterate the file directly instead of loading every line up front.
        for line in file:
            row = []
            for token in line.split():
                # Ids are handed out sequentially from 1, so the next free id
                # is always one more than the number of tokens seen so far.
                # setdefault only stores it when the token is new.
                row.append(token_dict.setdefault(token, len(token_dict) + 1))
            tokenized_lines.append(row)

    # default=0 keeps an empty file from raising ValueError in max().
    max_length = max(map(len, tokenized_lines), default=0)
    padded_lines = [
        row + [0] * (max_length - len(row)) for row in tokenized_lines
    ]
    return padded_lines, token_dict
# Demo run: tokenize the sample file and print the resulting
# (padded id rows, token-to-id mapping) pair.
file_path = 'example.txt'
print(tokenize_and_convert(file_path=file_path))
Comments
Post a Comment