在 Ubuntu 上搭建 Python 数据分析环境并进行 EDA 的实操指南
一 环境准备与安装
sudo apt update && sudo apt upgrade -y
sudo apt install python3 python3-pip -y
python3 -m venv data_env
source data_env/bin/activate
pip install numpy pandas matplotlib seaborn jupyter
pip install scikit-learn
pip install "dask[complete]"
wget https://repo.anaconda.com/archive/Anaconda3-2024.05-Linux-x86_64.sh && bash Anaconda3-2024.05-Linux-x86_64.sh
source ~/.bashrc
二 数据获取与清洗
df = pd.read_csv('data.csv')
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
engine = create_engine('mysql+pymysql://user:password@host:port/db')
data = pd.read_sql('SELECT * FROM table_name', engine)
df.dropna(axis=0, how='any', inplace=True)
df['col'].fillna(df['col'].mean(), inplace=True)
df['date'] = pd.to_datetime(df['date'])
三 探索性数据分析与可视化
df.describe()
df['col'].hist(); plt.show()
grouped = df.groupby('category')['value'].mean()
pivot = df.pivot_table(values='value', index='row', columns='col')
sns.scatterplot(x='total_bill', y='tip', data=tips); plt.show()
sns.boxplot(x='species', y='petal_length', data=iris); plt.show()
四 交互式分析与远程访问
pip install notebook
jupyter notebook(默认浏览器打开 http://localhost:8888)
jupyter notebook --generate-config
c.NotebookApp.ip = '0.0.0.0'
c.NotebookApp.open_browser = False
c.NotebookApp.port = 8888
jupyter notebook(如以 root 运行:jupyter notebook --allow-root)
http://服务器IP:8888 访问
五 常见问题与优化建议