email_id,количество_слов,время_ответа,ссылки,spam
1,150,2,3,0
2,80,5,0,1
3,220,1,1,0
4,95,4,2,1
5,300,0,5,0
...
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import export_text
import matplotlib.pyplot as plt
data = pd.read_csv("spam_data.csv")
data.fillna(method='ffill', inplace=True)
X = data.drop("spam", axis=1)
y = data["spam"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Верно угаданные метки (accuracy): {accuracy:.4f}")
print(f"Точность: {precision:.4f}")
print(f"Полнота: {recall:.4f}")
print(f"F1-мера: {f1:.4f}")
tree_rules = export_text(model, feature_names=X.columns.tolist())
print("\nВизуализация дерева решений в формате ASCII:\n")
print(tree_rules)
plt.figure(figsize=(20,15))
plot_tree(model, feature_names=X.columns, class_names=['Not Spam', 'Spam'], filled=True, rounded=True)
plt.show()
Верно угаданные метки (accuracy): 0.8500
Точность: 1.0000
Полнота: 0.6250
F1-мера: 0.7692
Визуализация дерева решений в формате ASCII:
|--- ссылки <= 1.50
| |--- количество_слов <= 82.50
| | |--- ссылки <= 0.50
| | | |--- class: 0.0
| | |--- ссылки > 0.50
| | | |--- email_id <= 20.00
| | | | |--- class: 0.0
| | | |--- email_id > 20.00
| | | | |--- class: 1.0
| |--- количество_слов > 82.50
| | |--- class: 0.0
|--- ссылки > 1.50
| |--- email_id <= 27.50
| | |--- class: 1.0
| |--- email_id > 27.50
| | |--- количество_слов <= 87.50
| | | |--- class: 1.0
| | |--- количество_слов > 87.50
| | | |--- email_id <= 50.50
| | | | |--- class: 0.0
| | | |--- email_id > 50.50
| | | | |--- email_id <= 56.50
| | | | | |--- class: 1.0
| | | | |--- email_id > 56.50
| | | | | |--- email_id <= 61.50
| | | | | | |--- class: 0.0
| | | | | |--- email_id > 61.50
| | | | | | |--- email_id <= 68.50
| | | | | | | |--- class: 1.0
| | | | | | |--- email_id > 68.50
| | | | | | | |--- ссылки <= 3.50
| | | | | | | | |--- время_ответа <= 6.50
| | | | | | | | | |--- время_ответа <= 2.50
| | | | | | | | | | |--- email_id <= 78.50
| | | | | | | | | | | |--- class: 0.0 |
| | | | | | | | | |--- email_id > 78.50
| | | | | | | | | | | |--- class: 1.0
| | | | | | | | | |--- время_ответа > 2.50
| | | | | | | | | | |--- class: 0.0
| | | | | | | | |--- время_ответа > 6.50
| | | | | | | | | |--- class: 1.0
| | | | | | | |--- ссылки > 3.50
| | | | | | | | |--- количество_слов <= 102.50
| | | | | | | | | |--- class: 0.0
| | | | | | | | |--- количество_слов > 102.50
| | | | | | | | | |--- class: 1.0