My data_tokenization class is in a package:
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import numpy as np

# from datasets import load_dataset
# dataset = load_dataset("cfilt/iitb-english-hindi")

class data_tokenization:
    def __init__(self):
        current_dir = os.getcwd()
        txt_folder = os.path.join(current_dir,r"data\\txt")
        dict_folder = os.path.join(current_dir,r"data\\dict")
        padded_folder = os.path.join(current_dir,r"data\\padded")
        self.current_dir = current_dir
        self.txt_folder = txt_folder
        self.dict_folder = dict_folder
        self.padded_folder = padded_folder

        #getting max sequence
        #loading dictionaries
        def get_source_dict(self):
            source_dict = {}
            with open(os.path.join(dict_folder,"source_dict.txt"),'r',encoding='utf-8') as f:
                source_dict = json.load(f)
            return source_dict

        def get_target_dict(self):
            target_dict = {}
            with open(os.path.join(dict_folder,"target_dict.txt"),'r',encoding='utf-8') as f:
                target_dict = json.load(f)
            return target_dict

        def max_length(self):
            max_scent_len = 0
            for filename in ['source_full.txt','target_full.txt']:
                with open(os.path.join(txt_folder,filename),'r',encoding='utf-8') as f:
                    for line in f:
                        if(len(line)>max_scent_len):
                            max_scent_len = len(line)
            return max_scent_len

        #loading sentence from file
        def load_token_sequences(self,filename,dict):
            with open(os.path.join(txt_folder,filename),'r',encoding='utf-8') as f:
                seq = []
                for line in f:
                    seq.append(tokenizer(line,dict))
            return seq

        #tokenizing sentence
        def tokenizer(self,line,dict):
            seq=[]
            for word in line.split():
                seq.append(dict.get(word,1))
            return seq
I call this tokenization class from another file named data_modeling.py; both files are in the same folder.

data_modeling.py:
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import tensorflow as tf
import numpy as np
import json
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.callbacks import EarlyStopping
from data_tokenization import data_tokenization

current_dir = os.getcwd()
txt_folder = os.path.join(current_dir,r"data\\txt")
artifact_folder = os.path.join(current_dir,"artifacts")

tokenizer = data_tokenization()
source_dictionary = tokenizer.get_source_dict()
max_index = max(source_dictionary.values())
source_count = len(source_dictionary)
target_dictionary = tokenizer.get_target_dict()
target_count = len(target_dictionary)

#source embedding
model = tf.keras.Sequential()
model.add(Embedding(input_dim=max_index+1, output_dim=128))
model.add(LSTM(64,return_sequences=True))
model.add(LSTM(32,return_sequences=True))
#model.add(Dense(64, activation='relu'))
model.add(Dense(target_count,activation='softmax'))
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None, tokenizer.max_length()))
model.summary()

valid_src_seq = []
with open(os.path.join(txt_folder,"source_valid.txt"), 'r',encoding='utf-8') as f:
    for line in f:
        valid_src_seq.append(tokenizer.tokenizer(line,source_dictionary))
print(valid_src_seq[0:5])

# target_sequence_padded_tr = np.array([[target_dictonary.get(word, 0) for word in sequence] for sequence in target_sequence_padded_tr])
# target_sequence_padded_v = np.array([[target_dictonary.get(word, 0) for word in sequence] for sequence in target_sequence_padded_v])
# source_sequence_padded_tr_subset = source_sequence_padded_tr[:5000]
# target_sequence_padded_tr_subset = target_sequence_padded_tr[:5000]

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=1,
    min_delta=0.001,
    restore_best_weights=True
)

#model.fit(source_sequence_padded_tr, target_sequence_padded_tr, batch_size=10,verbose=1 ,callbacks=[early_stopping], epochs=10, validation_data=(source_sequence_padded_v, target_sequence_padded_v))
#target_sequence_padded_te = np.array([[target_dictonary.get(word, 0) for word in sequence] for sequence in target_sequence_padded_te])
# test_loss, test_acc = model.evaluate(source_sequence_padded_te, target_sequence_padded_te)
# print(f"Test Accuracy: {test_acc}")
# model.save(os.path.join(artifact_folder, "translator_model.h5"))
I get this error:
Traceback (most recent call last):
  File "d:\WorkSpace\H2ETranslator\models\data_modeling.py", line 20, in <module>
    source_dictionary = tokenizer.get_source_dict()
                        ^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'data_tokenization' object has no attribute 'get_source_dict'
I tried changing tokenizer = data_tokenization() to tokenizer = data_tokenization.data_tokenization(), but that didn't change the result. I'd like to understand why this happens and how to fix it.
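For reference, as far as I understand both import spellings should resolve to the same class, so the import style by itself shouldn't matter; a minimal sketch of what I mean:

# if the whole module is imported, the class is reached through it:
import data_tokenization
tok = data_tokenization.data_tokenization()

# if the class itself is imported, the name already refers to the class:
from data_tokenization import data_tokenization
tok = data_tokenization()

# either way tok is an instance of the same class, so the
# AttributeError on get_source_dict stays the same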
From the comments I realized that my indentation had pushed all of the methods inside the __init__() method; fixing the indentation solved it.
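A rough sketch of the corrected layout (only some methods shown, the rest follow the same pattern; I also let the methods read the self.* paths and renamed the dict parameter to vocab so it doesn't shadow the builtin):

import os
import json

class data_tokenization:
    def __init__(self):
        self.current_dir = os.getcwd()
        self.txt_folder = os.path.join(self.current_dir,r"data\\txt")
        self.dict_folder = os.path.join(self.current_dir,r"data\\dict")
        self.padded_folder = os.path.join(self.current_dir,r"data\\padded")

    # defined at class level (same indentation as __init__),
    # so every instance exposes it as an attribute
    def get_source_dict(self):
        with open(os.path.join(self.dict_folder,"source_dict.txt"),'r',encoding='utf-8') as f:
            return json.load(f)

    def get_target_dict(self):
        with open(os.path.join(self.dict_folder,"target_dict.txt"),'r',encoding='utf-8') as f:
            return json.load(f)

    # tokenizing a sentence; unknown words map to index 1
    def tokenizer(self, line, vocab):
        return [vocab.get(word, 1) for word in line.split()]

With the def lines dedented to class level, tokenizer.get_source_dict() resolves on the instance as expected.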