在 Debian 上使用 Python 进行数据分析与可视化
一 环境准备与核心工具
二 标准工作流程
三 完整示例 Titanic 入门分析
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Titanic starter analysis: load the data, clean it, print summary statistics,
# draw two plots, then train and score a decision-tree classifier.

# 1) Load the data
df = pd.read_csv('titanic.csv')  # the file must be in the current working directory

# 2) Clean the data
# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on a column selection: the chained in-place form
# warns on pandas 2.x and does not modify `df` under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df = df.drop_duplicates()

# 3) Descriptive statistics
print(df.describe(include='all').to_string())

# 4) Plot: survival rate by sex
sns.barplot(x='Sex', y='Survived', data=df)
plt.title('Survival Rate by Gender')
plt.ylabel('Survival Rate')
plt.show()

# 5) Plot: age distribution per passenger class, split by sex
# (uses the original 'Sex' column, so it must run before the encoding below)
sns.boxplot(x='Pclass', y='Age', hue='Sex', data=df)
plt.title('Age Distribution by Class and Sex')
plt.show()

# 6) Modelling and evaluation
# One-hot encode 'Sex'. get_dummies replaces the column with indicator columns
# named 'Sex_<category>' (minus the first category because drop_first=True),
# so the feature list has to reference those columns rather than 'Sex' itself.
df = pd.get_dummies(df, columns=['Sex'], drop_first=True)
sex_columns = [col for col in df.columns if col.startswith('Sex_')]
features = ['Pclass', *sex_columns, 'Age', 'SibSp', 'Parch', 'Fare']
X = df[features]
y = df['Survived']
# Stratify on the target so both splits keep the same class balance.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_tr, y_tr)
preds = clf.predict(X_te)
print(f'Accuracy: {accuracy_score(y_te, preds):.3f}')
四 进阶与扩展
五 常见问题与建议