1105_02_Dataset and Statistics Description

1105_02_Dataset and Statistics Description
Photo by K. Mitch Hodge / Unsplash

讀取數據

from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
df = pd.read_csv('/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1015.csv')

檢視資料結構

print(df.info())
print(df.head())  # 顯示前幾行數據

檢查缺失值

print(df.isnull().sum())

基本敘述統計

print(df.describe())

類別變數的分佈

print(df['Industry'].value_counts())

數據的分佈和統計圖形

import matplotlib.pyplot as plt
import pandas as pd

# 假設 df 已包含日期資料
# 轉換日期欄位為 datetime 格式
df['Date'] = pd.to_datetime(df['Date'])

# 選擇 2024-10-15 或最近一天的數據
target_date = pd.Timestamp('2024-10-15')

# 找出 <= 2024-10-15 的最近一日數據
df_filtered = df[df['Date'] <= target_date].sort_values(by='Date', ascending=False).groupby('Company Name').first().reset_index()

# 將 Market Cap 的單位轉換為百萬
df_filtered['Market Cap (USD mn)'] = df_filtered['Market Cap (USD)'] / 1e6

# 繪製長條圖
plt.figure(figsize=(10, 6))
bars = plt.bar(df_filtered['Company Name'], df_filtered['Market Cap (USD mn)'])

# 設定 X 軸標籤並旋轉 90 度
plt.xticks(rotation=90)

# 添加 Market Cap 數值在每個長條的上方,並旋轉 90 度
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:.2f}',
             ha='center', va='bottom', rotation=90)

# 設定 Y 軸標籤
plt.ylabel('Market Cap (USD mn)')

# 設定標題
plt.title('Market Cap of Companies on or before 2024-10-15 (USD mn)')

# 顯示圖形
plt.tight_layout()
plt.show()

公司代碼簡寫修正

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd

# 假設 df 已包含日期資料
# 轉換日期欄位為 datetime 格式
df['Date'] = pd.to_datetime(df['Date'])

# 選擇 2024-10-15 或最近一天的數據
target_date = pd.Timestamp('2024-10-15')

# 找出 <= 2024-10-15 的最近一日數據
df_filtered = df[df['Date'] <= target_date].sort_values(by='Date', ascending=False).groupby('Company Name').first().reset_index()

# 將 Market Cap 的單位轉換為百萬
df_filtered['Market Cap (USD mn)'] = df_filtered['Market Cap (USD)'] / 1e6

# 使用公司簡稱作為 X 軸標籤 (可手動定義簡稱)
company_abbr = {
    'ASML Holding N.V.': 'ASML',
    'Advanced Micro Devices, Inc.': 'AMD',
    'Analog Devices, Inc.': 'ADI',
    'Applied Materials, Inc.': 'AMAT',
    'Arm Holdings plc': 'ARM',
    'Broadcom Inc.': 'AVGO',
    'Intel Corporation': 'INTC',
    'KLA Corporation': 'KLA',
    'Lam Research Corporation': 'LRCX',
    'Marvell Technology, Inc.': 'MRVL',
    'MediaTek Inc.': 'MTK',
    'Micron Technology, Inc.': 'MU',
    'NVIDIA Corporation': 'NVDA',
    'QUALCOMM Incorporated': 'QCOM',
    'SK hynix Inc.': 'SKH',
    'Samsung Electronics Co., Ltd.': 'SEC',
    'Synopsys, Inc.': 'SNPS',
    'Taiwan Semiconductor Manufacturing Company Limited': 'TSMC',
    'Texas Instruments Incorporated': 'TXN',
    'Tokyo Electron Limited': 'TEL'
}

df_filtered['Company Abbr'] = df_filtered['Company Name'].map(company_abbr)

# 繪製長條圖
plt.figure(figsize=(10, 6))
bars = plt.bar(df_filtered['Company Abbr'], df_filtered['Market Cap (USD mn)'])

# 設定 X 軸標籤並旋轉 90 度
plt.xticks(rotation=90)

# 設定 Y 軸範圍和千分號格式
plt.ylim(10000, 4000000)
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))

# 添加 Market Cap 數值在每個長條的上方,並旋轉 90 度
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:,.2f}',
             ha='center', va='bottom', rotation=90)

# 設定 Y 軸標籤
plt.ylabel('Market Cap (USD mn)')

# 設定標題
plt.title('Market Cap of Companies on or before 2024-10-15 (USD mn)')

# 顯示圖形
plt.tight_layout()
plt.show()

安裝 Cartopy ( 英國氣象局地圖包 )

pip install cartopyfrom

畫一張地圖

import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import pandas as pd

# 20 家公司總部位置資料
data = {
    'Company Name': ['NVIDIA', 'ASML', 'TSMC', 'Samsung', 'Intel', 'Qualcomm', 'Broadcom',
                     'Micron', 'Texas Instruments', 'SK hynix', 'MediaTek', 'Applied Materials',
                     'AMD', 'KLA Corporation', 'Lam Research', 'Analog Devices', 'Arm Holdings',
                     'Synopsys', 'Tokyo Electron', 'Marvell'],
    'Latitude': [37.7749, 52.0907, 25.0330, 37.5665, 37.3875, 32.7157, 37.4848,
                 43.6150, 32.7775, 37.4563, 24.7945, 37.6624, 37.7510, 37.4102,
                 37.4674, 42.3601, 51.5074, 37.3859, 35.6895, 37.5337],
    'Longitude': [-122.4194, 5.1214, 121.5654, 126.9780, -122.0575, -117.1611, -122.1483,
                  -116.2023, -96.7970, 127.0419, 120.9615, -121.8747, -122.0312, -122.0595,
                  -121.9630, -71.0589, -0.1276, -122.0867, 139.6917, -122.2712],
    'Funding Year': [1993, 1984, 1987, 1969, 1968, 1985, 1961,
                     1978, 1951, 1983, 1997, 1967, 1969, 1975,
                     1980, 1965, 1990, 1986, 1963, 1995],
    'Market Cap (USD mn)': [3228148, 287205, 1824631, 799625, 821929, 193936, 157902,
                            115664, 182463, 99626, 65306, 154747, 253519, 196841,
                            94818, 111213, 57402, 75064, 84342, 68784]
}

# 創建DataFrame
df = pd.DataFrame(data)

# 創建地圖,設定 16:9 比例 (16 x 9 英吋)
fig, ax = plt.subplots(figsize=(16, 9), subplot_kw={'projection': ccrs.PlateCarree()})
ax.coastlines()  # 繪製海岸線

# 設置地圖的視野範圍 (經度從 -130 到 150,緯度從 20 到 60)
ax.set_extent([-130, 150, -40, 80], crs=ccrs.PlateCarree())

# 標註公司位置,根據不同區域調整偏移
for idx, row in df.iterrows():
    x_offset = 2 if row['Longitude'] < 0 else -2  # 西半球向右偏移,東半球向左偏移
    y_offset = 0.5 if row['Latitude'] > 35 else -0.5  # 北緯度地區向上偏移,南緯度向下
    ax.scatter(row['Longitude'], row['Latitude'], color='blue', s=100, transform=ccrs.PlateCarree())  # 標註地點
    ax.text(row['Longitude'] + x_offset, row['Latitude'] + y_offset,
            f"{row['Funding Year']}/{row['Company Name']}\n{row['Market Cap (USD mn)']:,} mn",
            fontsize=8, ha='right', va='bottom', transform=ccrs.PlateCarree())

# 設定標題並顯示圖形
plt.title('TOP 20 Semiconductor Companies')
plt.tight_layout()

# 顯示圖形
plt.show()

新增一個欄位_Colse_Change %

import pandas as pd

# 讀取文件
input_file = '/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1015.csv'
output_file = '/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1016.csv'

# 讀取 CSV 文件
df = pd.read_csv(input_file)

# 計算每日的收盤價變動百分比(使用 Close_Price_Local)
df['Close_Price_Change%'] = df.groupby('Company Name')['Close Price_Local'].pct_change() * 100

# 保存結果為新的 CSV 文件
df.to_csv(output_file, index=False)

print(f"新文件已保存為: {output_file}")

相關性分析

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# 定義公司簡稱
company_abbr = {
    'ASML Holding N.V.': 'ASML',
    'Advanced Micro Devices, Inc.': 'AMD',
    'Analog Devices, Inc.': 'ADI',
    'Applied Materials, Inc.': 'AMAT',
    'Arm Holdings plc': 'ARM',
    'Broadcom Inc.': 'AVGO',
    'Intel Corporation': 'INTC',
    'KLA Corporation': 'KLA',
    'Lam Research Corporation': 'LRCX',
    'Marvell Technology, Inc.': 'MRVL',
    'MediaTek Inc.': 'MTK',
    'Micron Technology, Inc.': 'MU',
    'NVIDIA Corporation': 'NVDA',
    'QUALCOMM Incorporated': 'QCOM',
    'SK hynix Inc.': 'SKH',
    'Samsung Electronics Co., Ltd.': 'SEC',
    'Synopsys, Inc.': 'SNPS',
    'Taiwan Semiconductor Manufacturing Company Limited': 'TSMC',
    'Texas Instruments Incorporated': 'TXN',
    'Tokyo Electron Limited': 'TEL'
}

# 讀取文件
input_file = '/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1016.csv'
df = pd.read_csv(input_file)

# 使用簡稱替換公司名稱
df['Company'] = df['Company Name'].replace(company_abbr)

# 計算每日的收盤價變動百分比(使用 Close_Price_Local)
df['Close_Price_Change%'] = df.groupby('Company Name')['Close Price_Local'].pct_change() * 100

# 透過 pivot_table 生成檢定所需的格式
pivot_data = df.pivot_table(index='Date', columns='Company', values='Close_Price_Change%')

# 計算 Pearson 檢定的相關性矩陣
correlation_matrix = pivot_data.corr(method='pearson')

# 打印相關性矩陣
print(correlation_matrix)

# 繪製相關性矩陣的熱力圖
plt.figure(figsize=(16, 9))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f', linewidths=0.5)

# 設置標題和標籤
plt.title('Pearson Correlation of Close Price Change % between Companies')
plt.xlabel('Company')
plt.ylabel('Company')

# 顯示圖形
plt.tight_layout()
plt.show()
Company       ADI      AMAT       AMD       ARM      ASML      AVGO      INTC  \
Company                                                                         
ADI      1.000000  0.539530  0.419531  0.430383  0.567905  0.640933  0.490412   
AMAT     0.539530  1.000000  0.451153  0.495805  0.663621  0.628432  0.560144   
AMD      0.419531  0.451153  1.000000  0.357472  0.447404  0.414895  0.458046   
ARM      0.430383  0.495805  0.357472  1.000000  0.455873  0.440583  0.290491   
ASML     0.567905  0.663621  0.447404  0.455873  1.000000  0.586597  0.554499   
AVGO     0.640933  0.628432  0.414895  0.440583  0.586597  1.000000  0.496100   
INTC     0.490412  0.560144  0.458046  0.290491  0.554499  0.496100  1.000000   
KLA      0.504909  0.626665  0.405822  0.488126  0.637729  0.594638  0.488204   
LRCX     0.512741  0.626574  0.404138  0.505766  0.632134  0.615472  0.467907   
MRVL     0.563322  0.535484  0.458511  0.463141  0.509472  0.548569  0.472041   
MTK      0.066334  0.081401  0.080111  0.156635  0.092285  0.086666  0.068443   
MU       0.457881  0.507492  0.441036  0.415706  0.485258  0.526608  0.491098   
NVDA     0.521818  0.550850  0.477973  0.443273  0.523875  0.543118  0.488982   
QCOM     0.418520  0.454166  0.313115  0.517447  0.442484  0.552284  0.419880   
SEC      0.110245  0.107646  0.113304  0.082084  0.122386  0.092873  0.116963   
SKH      0.088178  0.100318  0.103451  0.145664  0.121854  0.100868  0.099498   
SNPS     0.404273  0.431426  0.313400  0.436885  0.429869  0.547774  0.389951   
TEL      0.117055  0.120653  0.109644  0.096048  0.155024  0.095427  0.120924   
TSMC     0.078147  0.078043  0.101324  0.134735  0.113671  0.117549  0.087504   
TXN      0.585316  0.575200  0.476387  0.428352  0.592432  0.634873  0.575910   

Company       KLA      LRCX      MRVL       MTK        MU      NVDA      QCOM  \
Company                                                                         
ADI      0.504909  0.512741  0.563322  0.066334  0.457881  0.521818  0.418520   
AMAT     0.626665  0.626574  0.535484  0.081401  0.507492  0.550850  0.454166   
AMD      0.405822  0.404138  0.458511  0.080111  0.441036  0.477973  0.313115   
ARM      0.488126  0.505766  0.463141  0.156635  0.415706  0.443273  0.517447   
ASML     0.637729  0.632134  0.509472  0.092285  0.485258  0.523875  0.442484   
AVGO     0.594638  0.615472  0.548569  0.086666  0.526608  0.543118  0.552284   
INTC     0.488204  0.467907  0.472041  0.068443  0.491098  0.488982  0.419880   
KLA      1.000000  0.590831  0.528312  0.062027  0.450090  0.544409  0.419462   
LRCX     0.590831  1.000000  0.540663  0.075748  0.440629  0.541526  0.417682   
MRVL     0.528312  0.540663  1.000000  0.067943  0.455867  0.522725  0.451298   
MTK      0.062027  0.075748  0.067943  1.000000  0.102163  0.071548  0.056503   
MU       0.450090  0.440629  0.455867  0.102163  1.000000  0.487872  0.344116   
NVDA     0.544409  0.541526  0.522725  0.071548  0.487872  1.000000  0.410904   
QCOM     0.419462  0.417682  0.451298  0.056503  0.344116  0.410904  1.000000   
SEC      0.099305  0.123324  0.134387  0.260812  0.150619  0.097464  0.075695   
SKH      0.082830  0.111571  0.111376  0.230734  0.168063  0.099142  0.060098   
SNPS     0.405762  0.404390  0.418394  0.049155  0.343242  0.420709  0.323752   
TEL      0.118099  0.135675  0.137325  0.267444  0.132956  0.116851  0.085584   
TSMC     0.071070  0.090963  0.110366  0.432079  0.106721  0.097365  0.047762   
TXN      0.522175  0.541465  0.529827  0.066131  0.538354  0.513220  0.430569   

Company       SEC       SKH      SNPS       TEL      TSMC       TXN  
Company                                                              
ADI      0.110245  0.088178  0.404273  0.117055  0.078147  0.585316  
AMAT     0.107646  0.100318  0.431426  0.120653  0.078043  0.575200  
AMD      0.113304  0.103451  0.313400  0.109644  0.101324  0.476387  
ARM      0.082084  0.145664  0.436885  0.096048  0.134735  0.428352  
ASML     0.122386  0.121854  0.429869  0.155024  0.113671  0.592432  
AVGO     0.092873  0.100868  0.547774  0.095427  0.117549  0.634873  
INTC     0.116963  0.099498  0.389951  0.120924  0.087504  0.575910  
KLA      0.099305  0.082830  0.405762  0.118099  0.071070  0.522175  
LRCX     0.123324  0.111571  0.404390  0.135675  0.090963  0.541465  
MRVL     0.134387  0.111376  0.418394  0.137325  0.110366  0.529827  
MTK      0.260812  0.230734  0.049155  0.267444  0.432079  0.066131  
MU       0.150619  0.168063  0.343242  0.132956  0.106721  0.538354  
NVDA     0.097464  0.099142  0.420709  0.116851  0.097365  0.513220  
QCOM     0.075695  0.060098  0.323752  0.085584  0.047762  0.430569  
SEC      1.000000  0.512639  0.081573  0.408271  0.423823  0.113037  
SKH      0.512639  1.000000  0.067578  0.301325  0.289684  0.077035  
SNPS     0.081573  0.067578  1.000000  0.109332  0.065144  0.402536  
TEL      0.408271  0.301325  0.109332  1.000000  0.395513  0.119600  
TSMC     0.423823  0.289684  0.065144  0.395513  1.000000  0.088416  
TXN      0.113037  0.077035  0.402536  0.119600  0.088416  1.000000  

Pearson 檢定

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# 定義公司簡稱
company_abbr = {
    'ASML Holding N.V.': 'ASML',
    'Advanced Micro Devices, Inc.': 'AMD',
    'Analog Devices, Inc.': 'ADI',
    'Applied Materials, Inc.': 'AMAT',
    'Arm Holdings plc': 'ARM',
    'Broadcom Inc.': 'AVGO',
    'Intel Corporation': 'INTC',
    'KLA Corporation': 'KLA',
    'Lam Research Corporation': 'LRCX',
    'Marvell Technology, Inc.': 'MRVL',
    'MediaTek Inc.': 'MTK',
    'Micron Technology, Inc.': 'MU',
    'NVIDIA Corporation': 'NVDA',
    'QUALCOMM Incorporated': 'QCOM',
    'SK hynix Inc.': 'SKH',
    'Samsung Electronics Co., Ltd.': 'SEC',
    'Synopsys, Inc.': 'SNPS',
    'Taiwan Semiconductor Manufacturing Company Limited': 'TSMC',
    'Texas Instruments Incorporated': 'TXN',
    'Tokyo Electron Limited': 'TEL'
}

# 讀取文件
input_file = '/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1016.csv'
df = pd.read_csv(input_file)

# 使用簡稱替換公司名稱
df['Company'] = df['Company Name'].replace(company_abbr)

# 計算每日的收盤價變動百分比(使用 Close_Price_Local)
df['Close_Price_Change%'] = df.groupby('Company Name')['Close Price_Local'].pct_change() * 100

# 透過 pivot_table 生成檢定所需的格式
pivot_data = df.pivot_table(index='Date', columns='Company', values='Close_Price_Change%')

# 創建一個空的 DataFrame 來存放結果
results = pd.DataFrame(columns=['Company 1', 'Company 2', 'Pearson R', 'P-value'])

# 計算每兩家公司之間的 Pearson R 和 P 值
companies = pivot_data.columns
for i in range(len(companies)):
    for j in range(i+1, len(companies)):  # 避免重複計算相同的組合
        comp1 = companies[i]
        comp2 = companies[j]

        # 去除 NaN 值,避免影響計算
        valid_data = pivot_data[[comp1, comp2]].dropna()

        if not valid_data.empty:
            # 計算 Pearson 相關性和 P 值
            r_value, p_value = pearsonr(valid_data[comp1], valid_data[comp2])

            # 將結果存入 DataFrame
            new_row = pd.DataFrame({
                'Company 1': [comp1],
                'Company 2': [comp2],
                'Pearson R': [r_value],
                'P-value': [p_value]
            })
            results = pd.concat([results, new_row], ignore_index=True)

# 顯示結果
print(results)

# 保存為 CSV 文件,如果需要保存結果到文件
output_file = '/content/drive/My Drive/Semicon_Analysis/pearson_correlation_results.csv'
results.to_csv(output_file, index=False)

  results = pd.concat([results, new_row], ignore_index=True)
    Company 1 Company 2  Pearson R        P-value
0         ADI      AMAT   0.539530   0.000000e+00
1         ADI       AMD   0.419531   0.000000e+00
2         ADI       ARM   0.430383   9.766949e-14
3         ADI      ASML   0.567905   0.000000e+00
4         ADI      AVGO   0.640933   0.000000e+00
..        ...       ...        ...            ...
185      SNPS      TSMC   0.065144   4.763340e-07
186      SNPS       TXN   0.402536  8.175838e-318
187       TEL      TSMC   0.395513  2.321578e-221
188       TEL       TXN   0.119600   1.692722e-20
189      TSMC       TXN   0.088416   7.882611e-12

[190 rows x 4 columns]

重新提供 Heatmap of Price Change% , 基於 P-value <0.05

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import numpy as np

# 定義公司簡稱
company_abbr = {
    'ASML Holding N.V.': 'ASML',
    'Advanced Micro Devices, Inc.': 'AMD',
    'Analog Devices, Inc.': 'ADI',
    'Applied Materials, Inc.': 'AMAT',
    'Arm Holdings plc': 'ARM',
    'Broadcom Inc.': 'AVGO',
    'Intel Corporation': 'INTC',
    'KLA Corporation': 'KLA',
    'Lam Research Corporation': 'LRCX',
    'Marvell Technology, Inc.': 'MRVL',
    'MediaTek Inc.': 'MTK',
    'Micron Technology, Inc.': 'MU',
    'NVIDIA Corporation': 'NVDA',
    'QUALCOMM Incorporated': 'QCOM',
    'SK hynix Inc.': 'SKH',
    'Samsung Electronics Co., Ltd.': 'SEC',
    'Synopsys, Inc.': 'SNPS',
    'Taiwan Semiconductor Manufacturing Company Limited': 'TSMC',
    'Texas Instruments Incorporated': 'TXN',
    'Tokyo Electron Limited': 'TEL'
}

# 讀取文件
input_file = '/content/drive/My Drive/Semicon_Analysis/semiconductor_merged_1016.csv'
df = pd.read_csv(input_file)

# 使用簡稱替換公司名稱
df['Company'] = df['Company Name'].replace(company_abbr)

# 計算每日的收盤價變動百分比(使用 Close_Price_Local)
df['Close_Price_Change%'] = df.groupby('Company Name')['Close Price_Local'].pct_change() * 100

# 透過 pivot_table 生成檢定所需的格式
pivot_data = df.pivot_table(index='Date', columns='Company', values='Close_Price_Change%')

# 創建一個空的 DataFrame 來存放結果
correlation_matrix = pd.DataFrame(np.zeros((len(pivot_data.columns), len(pivot_data.columns))),
                                  columns=pivot_data.columns, index=pivot_data.columns)

# 計算每兩家公司之間的 Pearson R 和 P 值
for i in range(len(pivot_data.columns)):
    for j in range(i+1, len(pivot_data.columns)):  # 避免重複計算相同的組合
        comp1 = pivot_data.columns[i]
        comp2 = pivot_data.columns[j]

        # 去除 NaN 值,避免影響計算
        valid_data = pivot_data[[comp1, comp2]].dropna()

        if not valid_data.empty:
            # 計算 Pearson 相關性和 P 值
            r_value, p_value = pearsonr(valid_data[comp1], valid_data[comp2])

            # 只在 p-value < 0.05 的情況下填入 Pearson R 值,否則設為 NaN
            if p_value < 0.05:
                correlation_matrix.loc[comp1, comp2] = r_value
                correlation_matrix.loc[comp2, comp1] = r_value
            else:
                correlation_matrix.loc[comp1, comp2] = np.nan
                correlation_matrix.loc[comp2, comp1] = np.nan

# 繪製相關性矩陣的熱力圖,只顯示符合條件的公司 P Value < 0.05
plt.figure(figsize=(16, 9))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt='.2f', linewidths=0.5, mask=correlation_matrix.isnull())

# 設置標題和標籤
plt.title('Heatmap of Company Close Price Change % (P-value < 0.05)')
plt.xlabel('Company')
plt.ylabel('Company')

# 顯示圖形
plt.tight_layout()
plt.show()