讀取數據
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
df = pd.read_csv('/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1015.csv')
檢視資料結構
print(df.info())
print(df.head()) # 顯示前幾行數據
檢查缺失值
print(df.isnull().sum())
基本敘述統計
print(df.describe())
類別變數的分佈
print(df['Industry'].value_counts())
數據的分佈和統計圖形
import matplotlib.pyplot as plt
import pandas as pd
# 假設 df 已包含日期資料
# 轉換日期欄位為 datetime 格式
df['Date'] = pd.to_datetime(df['Date'])
# 選擇 2024-10-15 或最近一天的數據
target_date = pd.Timestamp('2024-10-15')
# 找出 <= 2024-10-15 的最近一日數據
df_filtered = df[df['Date'] <= target_date].sort_values(by='Date', ascending=False).groupby('Company Name').first().reset_index()
# 將 Market Cap 的單位轉換為百萬
df_filtered['Market Cap (USD mn)'] = df_filtered['Market Cap (USD)'] / 1e6
# 繪製長條圖
plt.figure(figsize=(10, 6))
bars = plt.bar(df_filtered['Company Name'], df_filtered['Market Cap (USD mn)'])
# 設定 X 軸標籤並旋轉 90 度
plt.xticks(rotation=90)
# 添加 Market Cap 數值在每個長條的上方,並旋轉 90 度
for bar in bars:
yval = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:.2f}',
ha='center', va='bottom', rotation=90)
# 設定 Y 軸標籤
plt.ylabel('Market Cap (USD mn)')
# 設定標題
plt.title('Market Cap of Companies on or before 2024-10-15 (USD mn)')
# 顯示圖形
plt.tight_layout()
plt.show()
公司代碼簡寫修正
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
# 假設 df 已包含日期資料
# 轉換日期欄位為 datetime 格式
df['Date'] = pd.to_datetime(df['Date'])
# 選擇 2024-10-15 或最近一天的數據
target_date = pd.Timestamp('2024-10-15')
# 找出 <= 2024-10-15 的最近一日數據
df_filtered = df[df['Date'] <= target_date].sort_values(by='Date', ascending=False).groupby('Company Name').first().reset_index()
# 將 Market Cap 的單位轉換為百萬
df_filtered['Market Cap (USD mn)'] = df_filtered['Market Cap (USD)'] / 1e6
# 使用公司簡稱作為 X 軸標籤 (可手動定義簡稱)
company_abbr = {
'ASML Holding N.V.': 'ASML',
'Advanced Micro Devices, Inc.': 'AMD',
'Analog Devices, Inc.': 'ADI',
'Applied Materials, Inc.': 'AMAT',
'Arm Holdings plc': 'ARM',
'Broadcom Inc.': 'AVGO',
'Intel Corporation': 'INTC',
'KLA Corporation': 'KLA',
'Lam Research Corporation': 'LRCX',
'Marvell Technology, Inc.': 'MRVL',
'MediaTek Inc.': 'MTK',
'Micron Technology, Inc.': 'MU',
'NVIDIA Corporation': 'NVDA',
'QUALCOMM Incorporated': 'QCOM',
'SK hynix Inc.': 'SKH',
'Samsung Electronics Co., Ltd.': 'SEC',
'Synopsys, Inc.': 'SNPS',
'Taiwan Semiconductor Manufacturing Company Limited': 'TSMC',
'Texas Instruments Incorporated': 'TXN',
'Tokyo Electron Limited': 'TEL'
}
df_filtered['Company Abbr'] = df_filtered['Company Name'].map(company_abbr)
# 繪製長條圖
plt.figure(figsize=(10, 6))
bars = plt.bar(df_filtered['Company Abbr'], df_filtered['Market Cap (USD mn)'])
# 設定 X 軸標籤並旋轉 90 度
plt.xticks(rotation=90)
# 設定 Y 軸範圍和千分號格式
plt.ylim(10000, 4000000)
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))
# 添加 Market Cap 數值在每個長條的上方,並旋轉 90 度
for bar in bars:
yval = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:,.2f}',
ha='center', va='bottom', rotation=90)
# 設定 Y 軸標籤
plt.ylabel('Market Cap (USD mn)')
# 設定標題
plt.title('Market Cap of Companies on or before 2024-10-15 (USD mn)')
# 顯示圖形
plt.tight_layout()
plt.show()
安裝 Cartopy ( 英國氣象局地圖包 )
pip install cartopyfrom
畫一張地圖
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import pandas as pd
# 20 家公司總部位置資料
data = {
'Company Name': ['NVIDIA', 'ASML', 'TSMC', 'Samsung', 'Intel', 'Qualcomm', 'Broadcom',
'Micron', 'Texas Instruments', 'SK hynix', 'MediaTek', 'Applied Materials',
'AMD', 'KLA Corporation', 'Lam Research', 'Analog Devices', 'Arm Holdings',
'Synopsys', 'Tokyo Electron', 'Marvell'],
'Latitude': [37.7749, 52.0907, 25.0330, 37.5665, 37.3875, 32.7157, 37.4848,
43.6150, 32.7775, 37.4563, 24.7945, 37.6624, 37.7510, 37.4102,
37.4674, 42.3601, 51.5074, 37.3859, 35.6895, 37.5337],
'Longitude': [-122.4194, 5.1214, 121.5654, 126.9780, -122.0575, -117.1611, -122.1483,
-116.2023, -96.7970, 127.0419, 120.9615, -121.8747, -122.0312, -122.0595,
-121.9630, -71.0589, -0.1276, -122.0867, 139.6917, -122.2712],
'Funding Year': [1993, 1984, 1987, 1969, 1968, 1985, 1961,
1978, 1951, 1983, 1997, 1967, 1969, 1975,
1980, 1965, 1990, 1986, 1963, 1995],
'Market Cap (USD mn)': [3228148, 287205, 1824631, 799625, 821929, 193936, 157902,
115664, 182463, 99626, 65306, 154747, 253519, 196841,
94818, 111213, 57402, 75064, 84342, 68784]
}
# 創建DataFrame
df = pd.DataFrame(data)
# 創建地圖,設定 16:9 比例 (16 x 9 英吋)
fig, ax = plt.subplots(figsize=(16, 9), subplot_kw={'projection': ccrs.PlateCarree()})
ax.coastlines() # 繪製海岸線
# 設置地圖的視野範圍 (經度從 -130 到 150,緯度從 20 到 60)
ax.set_extent([-130, 150, -40, 80], crs=ccrs.PlateCarree())
# 標註公司位置,根據不同區域調整偏移
for idx, row in df.iterrows():
x_offset = 2 if row['Longitude'] < 0 else -2 # 西半球向右偏移,東半球向左偏移
y_offset = 0.5 if row['Latitude'] > 35 else -0.5 # 北緯度地區向上偏移,南緯度向下
ax.scatter(row['Longitude'], row['Latitude'], color='blue', s=100, transform=ccrs.PlateCarree()) # 標註地點
ax.text(row['Longitude'] + x_offset, row['Latitude'] + y_offset,
f"{row['Funding Year']}/{row['Company Name']}\n{row['Market Cap (USD mn)']:,} mn",
fontsize=8, ha='right', va='bottom', transform=ccrs.PlateCarree())
# 設定標題並顯示圖形
plt.title('TOP 20 Semiconductor Companies')
plt.tight_layout()
# 顯示圖形
plt.show()
新增一個欄位_Colse_Change %
import pandas as pd
# 讀取文件
input_file = '/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1015.csv'
output_file = '/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1016.csv'
# 讀取 CSV 文件
df = pd.read_csv(input_file)
# 計算每日的收盤價變動百分比(使用 Close_Price_Local)
df['Close_Price_Change%'] = df.groupby('Company Name')['Close Price_Local'].pct_change() * 100
# 保存結果為新的 CSV 文件
df.to_csv(output_file, index=False)
print(f"新文件已保存為: {output_file}")
相關性分析
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
# 定義公司簡稱
company_abbr = {
'ASML Holding N.V.': 'ASML',
'Advanced Micro Devices, Inc.': 'AMD',
'Analog Devices, Inc.': 'ADI',
'Applied Materials, Inc.': 'AMAT',
'Arm Holdings plc': 'ARM',
'Broadcom Inc.': 'AVGO',
'Intel Corporation': 'INTC',
'KLA Corporation': 'KLA',
'Lam Research Corporation': 'LRCX',
'Marvell Technology, Inc.': 'MRVL',
'MediaTek Inc.': 'MTK',
'Micron Technology, Inc.': 'MU',
'NVIDIA Corporation': 'NVDA',
'QUALCOMM Incorporated': 'QCOM',
'SK hynix Inc.': 'SKH',
'Samsung Electronics Co., Ltd.': 'SEC',
'Synopsys, Inc.': 'SNPS',
'Taiwan Semiconductor Manufacturing Company Limited': 'TSMC',
'Texas Instruments Incorporated': 'TXN',
'Tokyo Electron Limited': 'TEL'
}
# 讀取文件
input_file = '/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1016.csv'
df = pd.read_csv(input_file)
# 使用簡稱替換公司名稱
df['Company'] = df['Company Name'].replace(company_abbr)
# 計算每日的收盤價變動百分比(使用 Close_Price_Local)
df['Close_Price_Change%'] = df.groupby('Company Name')['Close Price_Local'].pct_change() * 100
# 透過 pivot_table 生成檢定所需的格式
pivot_data = df.pivot_table(index='Date', columns='Company', values='Close_Price_Change%')
# 計算 Pearson 檢定的相關性矩陣
correlation_matrix = pivot_data.corr(method='pearson')
# 打印相關性矩陣
print(correlation_matrix)
# 繪製相關性矩陣的熱力圖
plt.figure(figsize=(16, 9))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f', linewidths=0.5)
# 設置標題和標籤
plt.title('Pearson Correlation of Close Price Change % between Companies')
plt.xlabel('Company')
plt.ylabel('Company')
# 顯示圖形
plt.tight_layout()
plt.show()
Company ADI AMAT AMD ARM ASML AVGO INTC \
Company
ADI 1.000000 0.539530 0.419531 0.430383 0.567905 0.640933 0.490412
AMAT 0.539530 1.000000 0.451153 0.495805 0.663621 0.628432 0.560144
AMD 0.419531 0.451153 1.000000 0.357472 0.447404 0.414895 0.458046
ARM 0.430383 0.495805 0.357472 1.000000 0.455873 0.440583 0.290491
ASML 0.567905 0.663621 0.447404 0.455873 1.000000 0.586597 0.554499
AVGO 0.640933 0.628432 0.414895 0.440583 0.586597 1.000000 0.496100
INTC 0.490412 0.560144 0.458046 0.290491 0.554499 0.496100 1.000000
KLA 0.504909 0.626665 0.405822 0.488126 0.637729 0.594638 0.488204
LRCX 0.512741 0.626574 0.404138 0.505766 0.632134 0.615472 0.467907
MRVL 0.563322 0.535484 0.458511 0.463141 0.509472 0.548569 0.472041
MTK 0.066334 0.081401 0.080111 0.156635 0.092285 0.086666 0.068443
MU 0.457881 0.507492 0.441036 0.415706 0.485258 0.526608 0.491098
NVDA 0.521818 0.550850 0.477973 0.443273 0.523875 0.543118 0.488982
QCOM 0.418520 0.454166 0.313115 0.517447 0.442484 0.552284 0.419880
SEC 0.110245 0.107646 0.113304 0.082084 0.122386 0.092873 0.116963
SKH 0.088178 0.100318 0.103451 0.145664 0.121854 0.100868 0.099498
SNPS 0.404273 0.431426 0.313400 0.436885 0.429869 0.547774 0.389951
TEL 0.117055 0.120653 0.109644 0.096048 0.155024 0.095427 0.120924
TSMC 0.078147 0.078043 0.101324 0.134735 0.113671 0.117549 0.087504
TXN 0.585316 0.575200 0.476387 0.428352 0.592432 0.634873 0.575910
Company KLA LRCX MRVL MTK MU NVDA QCOM \
Company
ADI 0.504909 0.512741 0.563322 0.066334 0.457881 0.521818 0.418520
AMAT 0.626665 0.626574 0.535484 0.081401 0.507492 0.550850 0.454166
AMD 0.405822 0.404138 0.458511 0.080111 0.441036 0.477973 0.313115
ARM 0.488126 0.505766 0.463141 0.156635 0.415706 0.443273 0.517447
ASML 0.637729 0.632134 0.509472 0.092285 0.485258 0.523875 0.442484
AVGO 0.594638 0.615472 0.548569 0.086666 0.526608 0.543118 0.552284
INTC 0.488204 0.467907 0.472041 0.068443 0.491098 0.488982 0.419880
KLA 1.000000 0.590831 0.528312 0.062027 0.450090 0.544409 0.419462
LRCX 0.590831 1.000000 0.540663 0.075748 0.440629 0.541526 0.417682
MRVL 0.528312 0.540663 1.000000 0.067943 0.455867 0.522725 0.451298
MTK 0.062027 0.075748 0.067943 1.000000 0.102163 0.071548 0.056503
MU 0.450090 0.440629 0.455867 0.102163 1.000000 0.487872 0.344116
NVDA 0.544409 0.541526 0.522725 0.071548 0.487872 1.000000 0.410904
QCOM 0.419462 0.417682 0.451298 0.056503 0.344116 0.410904 1.000000
SEC 0.099305 0.123324 0.134387 0.260812 0.150619 0.097464 0.075695
SKH 0.082830 0.111571 0.111376 0.230734 0.168063 0.099142 0.060098
SNPS 0.405762 0.404390 0.418394 0.049155 0.343242 0.420709 0.323752
TEL 0.118099 0.135675 0.137325 0.267444 0.132956 0.116851 0.085584
TSMC 0.071070 0.090963 0.110366 0.432079 0.106721 0.097365 0.047762
TXN 0.522175 0.541465 0.529827 0.066131 0.538354 0.513220 0.430569
Company SEC SKH SNPS TEL TSMC TXN
Company
ADI 0.110245 0.088178 0.404273 0.117055 0.078147 0.585316
AMAT 0.107646 0.100318 0.431426 0.120653 0.078043 0.575200
AMD 0.113304 0.103451 0.313400 0.109644 0.101324 0.476387
ARM 0.082084 0.145664 0.436885 0.096048 0.134735 0.428352
ASML 0.122386 0.121854 0.429869 0.155024 0.113671 0.592432
AVGO 0.092873 0.100868 0.547774 0.095427 0.117549 0.634873
INTC 0.116963 0.099498 0.389951 0.120924 0.087504 0.575910
KLA 0.099305 0.082830 0.405762 0.118099 0.071070 0.522175
LRCX 0.123324 0.111571 0.404390 0.135675 0.090963 0.541465
MRVL 0.134387 0.111376 0.418394 0.137325 0.110366 0.529827
MTK 0.260812 0.230734 0.049155 0.267444 0.432079 0.066131
MU 0.150619 0.168063 0.343242 0.132956 0.106721 0.538354
NVDA 0.097464 0.099142 0.420709 0.116851 0.097365 0.513220
QCOM 0.075695 0.060098 0.323752 0.085584 0.047762 0.430569
SEC 1.000000 0.512639 0.081573 0.408271 0.423823 0.113037
SKH 0.512639 1.000000 0.067578 0.301325 0.289684 0.077035
SNPS 0.081573 0.067578 1.000000 0.109332 0.065144 0.402536
TEL 0.408271 0.301325 0.109332 1.000000 0.395513 0.119600
TSMC 0.423823 0.289684 0.065144 0.395513 1.000000 0.088416
TXN 0.113037 0.077035 0.402536 0.119600 0.088416 1.000000
Pearson 檢定
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
# 定義公司簡稱
company_abbr = {
'ASML Holding N.V.': 'ASML',
'Advanced Micro Devices, Inc.': 'AMD',
'Analog Devices, Inc.': 'ADI',
'Applied Materials, Inc.': 'AMAT',
'Arm Holdings plc': 'ARM',
'Broadcom Inc.': 'AVGO',
'Intel Corporation': 'INTC',
'KLA Corporation': 'KLA',
'Lam Research Corporation': 'LRCX',
'Marvell Technology, Inc.': 'MRVL',
'MediaTek Inc.': 'MTK',
'Micron Technology, Inc.': 'MU',
'NVIDIA Corporation': 'NVDA',
'QUALCOMM Incorporated': 'QCOM',
'SK hynix Inc.': 'SKH',
'Samsung Electronics Co., Ltd.': 'SEC',
'Synopsys, Inc.': 'SNPS',
'Taiwan Semiconductor Manufacturing Company Limited': 'TSMC',
'Texas Instruments Incorporated': 'TXN',
'Tokyo Electron Limited': 'TEL'
}
# 讀取文件
input_file = '/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1016.csv'
df = pd.read_csv(input_file)
# 使用簡稱替換公司名稱
df['Company'] = df['Company Name'].replace(company_abbr)
# 計算每日的收盤價變動百分比(使用 Close_Price_Local)
df['Close_Price_Change%'] = df.groupby('Company Name')['Close Price_Local'].pct_change() * 100
# 透過 pivot_table 生成檢定所需的格式
pivot_data = df.pivot_table(index='Date', columns='Company', values='Close_Price_Change%')
# 創建一個空的 DataFrame 來存放結果
results = pd.DataFrame(columns=['Company 1', 'Company 2', 'Pearson R', 'P-value'])
# 計算每兩家公司之間的 Pearson R 和 P 值
companies = pivot_data.columns
for i in range(len(companies)):
for j in range(i+1, len(companies)): # 避免重複計算相同的組合
comp1 = companies[i]
comp2 = companies[j]
# 去除 NaN 值,避免影響計算
valid_data = pivot_data[[comp1, comp2]].dropna()
if not valid_data.empty:
# 計算 Pearson 相關性和 P 值
r_value, p_value = pearsonr(valid_data[comp1], valid_data[comp2])
# 將結果存入 DataFrame
new_row = pd.DataFrame({
'Company 1': [comp1],
'Company 2': [comp2],
'Pearson R': [r_value],
'P-value': [p_value]
})
results = pd.concat([results, new_row], ignore_index=True)
# 顯示結果
print(results)
# 保存為 CSV 文件,如果需要保存結果到文件
output_file = '/content/drive/My Drive/Semicon_Analysis/pearson_correlation_results.csv'
results.to_csv(output_file, index=False)
results = pd.concat([results, new_row], ignore_index=True)
Company 1 Company 2 Pearson R P-value
0 ADI AMAT 0.539530 0.000000e+00
1 ADI AMD 0.419531 0.000000e+00
2 ADI ARM 0.430383 9.766949e-14
3 ADI ASML 0.567905 0.000000e+00
4 ADI AVGO 0.640933 0.000000e+00
.. ... ... ... ...
185 SNPS TSMC 0.065144 4.763340e-07
186 SNPS TXN 0.402536 8.175838e-318
187 TEL TSMC 0.395513 2.321578e-221
188 TEL TXN 0.119600 1.692722e-20
189 TSMC TXN 0.088416 7.882611e-12
[190 rows x 4 columns]
重新提供 Heatmap of Price Change% , 基於 P-value <0.05
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import numpy as np
# 定義公司簡稱
company_abbr = {
'ASML Holding N.V.': 'ASML',
'Advanced Micro Devices, Inc.': 'AMD',
'Analog Devices, Inc.': 'ADI',
'Applied Materials, Inc.': 'AMAT',
'Arm Holdings plc': 'ARM',
'Broadcom Inc.': 'AVGO',
'Intel Corporation': 'INTC',
'KLA Corporation': 'KLA',
'Lam Research Corporation': 'LRCX',
'Marvell Technology, Inc.': 'MRVL',
'MediaTek Inc.': 'MTK',
'Micron Technology, Inc.': 'MU',
'NVIDIA Corporation': 'NVDA',
'QUALCOMM Incorporated': 'QCOM',
'SK hynix Inc.': 'SKH',
'Samsung Electronics Co., Ltd.': 'SEC',
'Synopsys, Inc.': 'SNPS',
'Taiwan Semiconductor Manufacturing Company Limited': 'TSMC',
'Texas Instruments Incorporated': 'TXN',
'Tokyo Electron Limited': 'TEL'
}
# 讀取文件
input_file = '/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1016.csv'
df = pd.read_csv(input_file)
# 使用簡稱替換公司名稱
df['Company'] = df['Company Name'].replace(company_abbr)
# 計算每日的收盤價變動百分比(使用 Close_Price_Local)
df['Close_Price_Change%'] = df.groupby('Company Name')['Close Price_Local'].pct_change() * 100
# 透過 pivot_table 生成檢定所需的格式
pivot_data = df.pivot_table(index='Date', columns='Company', values='Close_Price_Change%')
# 創建一個空的 DataFrame 來存放結果
correlation_matrix = pd.DataFrame(np.zeros((len(pivot_data.columns), len(pivot_data.columns))),
columns=pivot_data.columns, index=pivot_data.columns)
# 計算每兩家公司之間的 Pearson R 和 P 值
for i in range(len(pivot_data.columns)):
for j in range(i+1, len(pivot_data.columns)): # 避免重複計算相同的組合
comp1 = pivot_data.columns[i]
comp2 = pivot_data.columns[j]
# 去除 NaN 值,避免影響計算
valid_data = pivot_data[[comp1, comp2]].dropna()
if not valid_data.empty:
# 計算 Pearson 相關性和 P 值
r_value, p_value = pearsonr(valid_data[comp1], valid_data[comp2])
# 只在 p-value < 0.05 的情況下填入 Pearson R 值,否則設為 NaN
if p_value < 0.05:
correlation_matrix.loc[comp1, comp2] = r_value
correlation_matrix.loc[comp2, comp1] = r_value
else:
correlation_matrix.loc[comp1, comp2] = np.nan
correlation_matrix.loc[comp2, comp1] = np.nan
# 繪製相關性矩陣的熱力圖,只顯示符合條件的公司 P Value < 0.05
plt.figure(figsize=(16, 9))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f', linewidths=0.5, mask=correlation_matrix.isnull())
# 設置標題和標籤
plt.title('Heatmap of Company Close Price Change % (P-value < 0.05)')
plt.xlabel('Company')
plt.ylabel('Company')
# 顯示圖形
plt.tight_layout()
plt.show()