Gepickelte Klassifikatordaten laden: Fehler „Vocabulary wasn't fitted“ (Vokabular nicht trainiert)
Ich habe alle zugehörigen Fragen hier gelesen, konnte aber keine funktionierende Lösung finden:
Meine Klassifikatorerstellung:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose analyzer post-processes tokens with a stemmer."""

    def build_analyzer(self):
        """Return a callable mapping a document to its stemmed tokens.

        The analyzer built by the parent class is applied first; its output
        is then passed through ``english_stemmer.stemWords``.
        """
        # super() must name *this* class. Naming TfidfVectorizer would start
        # the method lookup after TfidfVectorizer in the MRO and skip it.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))
# Module-level vectorizer used by the code below: word-level analysis with
# ngram_range (1, 2), the built-in 'english' stop-word list and a cap of
# 200000 features.
tf = StemmedTfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=0,
    max_features=200000,
    stop_words='english',
)
def create_tfidf(f):
    """Read a ';'-separated CSV file and build a TF-IDF matrix from it.

    The first row of the file is skipped (presumably a header row). For every
    remaining row, column 1 is collected as the document text and column 0 as
    its target. The module-level vectorizer ``tf`` is fitted on the documents,
    and the shape of the resulting matrix is printed.

    Returns a tuple ``(tfidf_matrix, targets)``.
    """
    with open(f, "r") as sentences_file:
        reader = csv.reader(sentences_file, delimiter=';')
        next(reader)  # discard the first row
        rows = list(reader)

    docs = [row[1] for row in rows]
    targets = [row[0] for row in rows]

    tfidf_matrix = tf.fit_transform(docs)
    print(tfidf_matrix.shape)
    # print tf.get_feature_names()
    return tfidf_matrix, targets
# Build the features from l0.csv, train a linear SVM on them and persist the
# trained classifier.
# NOTE(review): only `clf` is written to disk here; the vectorizer `tf` that
# was fitted inside create_tfidf() is not saved by these statements.
X, y = create_tfidf("l0.csv")
clf = LinearSVC()
clf.fit(X, y)
_ = joblib.dump(clf, 'linearL0_3gram_100K.pkl', compress=9)
Dieser Teil funktioniert und erzeugt die .pkl-Datei, die ich anschließend in einem anderen Skript wie folgt zu verwenden versuche:
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose analyzer post-processes tokens with a stemmer."""

    def build_analyzer(self):
        """Return a callable mapping a document to its stemmed tokens.

        The analyzer built by the parent class is applied first; its output
        is then passed through ``english_stemmer.stemWords``.
        """
        # super() must name *this* class. Naming TfidfVectorizer would start
        # the method lookup after TfidfVectorizer in the MRO and skip it.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))
# NOTE(review): `tf` is a newly constructed vectorizer; nothing in this script
# calls fit()/fit_transform() on it before transform() is used below.
tf = StemmedTfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=0,
    max_features=200000,
    stop_words='english',
)

# Load the classifier that the training script wrote to disk.
clf = joblib.load('linearL0_3gram_100K.pkl')
print(clf)

# Vectorize a single test sentence and print it followed by the predicted label.
test = "My super elaborate test string to test predictions"
features = tf.transform([test])
print(test + clf.predict(features)[0])
Und ich bekomme folgenden Fehler: ValueError: Vocabulary wasn't fitted or is empty!
Nachtrag (Edit): Traceback wie gewünscht
File "classifier.py", line 27, in <module>
print test + clf.predict(tf.transform([test]))[0]
File "/home/ec2-user/.local/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 1313, in transform
X = super(TfidfVectorizer, self).transform(raw_documents)
File "/home/ec2-user/.local/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 850, in transform
self._check_vocabulary()
File "/home/ec2-user/.local/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 271, in _check_vocabulary
check_is_fitted(self, 'vocabulary_', msg=msg),
File "/home/ec2-user/.local/lib/python2.7/site-packages/sklearn/utils/validation.py", line 627, in check_is_fitted
raise NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.utils.validation.NotFittedError: StemmedTfidfVectorizer - Vocabulary wasn't fitted.