改写成果
今天的主要成果是利用 ChatGPT 对 xlsx 中的脂肪肝病人数据进行了替换(把数值指标替换为类别标签)。根据不同指标与是否患代谢性脂肪肝之间的关联,最终选定原始指标13个,分别为:性别(gender)、体检年龄(age)、丙氨酸氨基转移酶(ALT)、天门冬氨酸氨基转移酶(AST)、γ-谷氨酰基转移酶(GGT)、总胆红素(TBIL)、尿酸(UA)、总胆固醇(TC)、甘油三酯(TG)、腰围(WC)、体重指数(BMI)、是否有糖尿病、高密度脂蛋白胆固醇(HDL-C)。各个指标的参考值、异常值占比及均值±标准差见下表。
之前师姐给我们的数据是数字,也可以说是结构化数据,但是要构建时序的知识图谱还有很长的一段路要走。现在最关键的问题是如何去构建这样的一个知识图谱,所以从今天开始做了一个尝试。
指标 | 参考值 | 异常占比 | 均值标准差 |
---|---|---|---|
性别(gender) | -- | -- | -- |
年龄(age) | -- | -- | 47.42±13.82 |
ALT | 0-40 | 12.7% | 25.67±23.8 |
AST | 0-40 | 3.6% | 21.8±14.78 |
GGT | 男: 11-50,女:7-32 | 17.4% | 30.16±33.58 |
TBIL | 3.4-17.1 | 31.6% | 15.64±6.15 |
UA | 男:150-380,女:100-300 | 48.7% | 349.82±90.73 |
TC | 0-5.2 | 38.6% | 5.01±0.94 |
TG | 0-1.7 | 29.7% | 1.57±1.26 |
WC | 男:0-85,女:0-80 | 62.8% | 86.59±10.67 |
BMI | 0-23.9 | 40.3% | 24.82±3.64 |
是否有糖尿病 | 0 | 10.6% | -- |
HDL-C | 男:1.16-1.42,女:1.29-1.55 | 69.8% | 1.28±0.33 |
上面的表格就是对应的处理部分,然后应用chatgpt对我的数据进行了处理。
下面是我的python代码:
import openpyxl
# Input workbook and the file the labelled copy is written to.
input_file = '3 合并(脂肪肝+非脂肪肝)完整数据.xlsx'
output_file = 'output1.xlsx'

# Sex code (worksheet column 2) of the row currently being processed.
# The sex-specific rules read it at call time, so it must be assigned
# before any rule is applied to a cell of that row.
column_2_value = None


def grade(x, upper, lower=None, severe=None):
    """Turn a numeric measurement into a categorical label.

    Args:
        x: Cell value. Anything that is not a real number (blank cell,
            text, bool, NaN) is returned unchanged instead of raising.
        upper: Exclusive upper limit of the normal range.
        lower: Inclusive lower limit of the normal range, or None when
            the range has no lower limit.
        severe: Values at or above this limit are labelled '过高'
            instead of '偏高'; None disables the '过高' level.

    Returns:
        '偏低', '正常', '偏高' or '过高', or ``x`` itself when it is not
        a number.
    """
    if isinstance(x, bool) or not isinstance(x, (int, float)) or x != x:
        return x
    if lower is not None and x < lower:
        # Below the reference range; previously this fell through to '过高'.
        return '偏低'
    if x < upper:
        return '正常'
    if severe is None or x < severe:
        return '偏高'
    return '过高'


def by_sex(female, male):
    """Build a rule that picks its limits from the current row's sex code.

    ``female`` and ``male`` are keyword dicts for :func:`grade`. The
    ``female`` limits (the ranges the reference table lists for women) are
    used when ``column_2_value == 1``, the ``male`` limits otherwise.

    The choice is made with an explicit ``if``/``else`` before grading.
    The earlier one-line chained conditional expressions grouped
    right-to-left, so the first range was tested before the sex check and
    was applied to every row.
    """
    def rule(x):
        limits = female if column_2_value == 1 else male
        return grade(x, **limits)
    return rule


# Replacement rule per worksheet column (1-based, as reported by
# cell.column). Thresholds follow the reference table in the notes;
# columns without an entry are left unchanged.
replace_rules = {
    4: lambda x: grade(x, 40, severe=80),     # ALT
    5: lambda x: grade(x, 40, severe=80),     # AST
    6: by_sex(                                # GGT
        female=dict(lower=7, upper=32, severe=64),
        male=dict(lower=11, upper=50, severe=100),
    ),
    # TBIL: 3.4-17.1 is the normal range (it used to be labelled '偏高').
    7: lambda x: grade(x, 17.1, lower=3.4),
    8: by_sex(                                # UA
        female=dict(lower=100, upper=300, severe=600),
        male=dict(lower=150, upper=380, severe=500),
    ),
    9: lambda x: grade(x, 5.2),               # TC
    10: lambda x: grade(x, 1.7),              # TG
    11: by_sex(                               # WC
        female=dict(upper=80, severe=90),
        male=dict(upper=85, severe=95),
    ),
    12: lambda x: grade(x, 23.9),             # BMI
    14: by_sex(                               # HDL-C
        female=dict(lower=1.29, upper=1.55, severe=1.89),
        male=dict(lower=1.16, upper=1.42, severe=1.77),
    ),
}

if __name__ == '__main__':
    # Load the workbook and work on its active sheet.
    wb = openpyxl.load_workbook(input_file)
    ws = wb.active

    # Collect (cell, new value) pairs first and write them afterwards, so
    # no cell is modified while the sheet is still being read.
    replace_values = []
    for row in ws.iter_rows(min_row=5):
        column_2_value = row[1].value
        for cell in row[2:]:
            column_index = cell.column
            replace_func = replace_rules.get(column_index, lambda x: x)
            replace_values.append((cell, replace_func(cell.value)))

    # Apply all replacements in one pass.
    for cell, replace_value in replace_values:
        cell.value = replace_value

    # Save the modified workbook under the output name.
    wb.save(output_file)
这是我第一次用 openpyxl 这个包。代码首先读取文件中的内容,然后定义替换规则;因为部分指标的参考范围要根据性别的不同进行区分,所以在规则里加入了 if column_2_value == 1 的判断;最后进行替换。min_row=5 表示从第 5 行开始处理。下面我将会展示我的第一版,它是根据列进行处理的,但是还差一步没有解决,所以才转到从行入手:按行遍历可以同时获得这一行的其他数据(例如性别),因此更好处理。
replace_func = replace_rules.get(column_index, lambda x: x)
replace_values.append((cell, replace_func(cell.value)))
首先是对规则的应用,然后缓存替换输出,十分的漂亮。
下面是第一版从列入手的代码,也值得借鉴。
import pandas as pd
def grade_two_cutoffs(x, normal_below=40, high_below=80):
    """Label a number as '正常', '偏高' or '过高'.

    Values that cannot be compared with a number (text, None) and NaN are
    returned unchanged, so missing cells are not labelled '过高'.
    """
    try:
        if x != x:  # NaN is the only value that differs from itself
            return x
        if x < normal_below:
            return '正常'
        if x < high_below:
            return '偏高'
    except TypeError:
        return x
    return '过高'


def resolve_column(df, column):
    """Return the column label for ``column``.

    ``column`` may be an existing label, or an integer position that is
    looked up in ``df.columns`` when no column carries that label.
    """
    if isinstance(column, int) and column not in df.columns:
        return df.columns[column]
    return column


def replace_column_values(filename, sheetname, column_name, start_row=3):
    """Label one column of an Excel sheet and save the result as a new file.

    Args:
        filename: Path of the workbook to read.
        sheetname: Sheet to read.
        column_name: Column to relabel, given as a label or as an integer
            position.
        start_row: First DataFrame index label to relabel; earlier rows
            are left untouched.

    The result is written next to the input as ``<name>_new1.xlsx``.
    """
    import os

    # Read the original file.
    df = pd.read_excel(filename, sheet_name=sheetname)

    # Use the parameter itself (the old code wrote to a global
    # ``column_index``) and accept a position as well as a label.
    column = resolve_column(df, column_name)

    # Text labels are about to be stored in what may be a numeric column.
    df[column] = df[column].astype(object)
    df.loc[start_row:, column] = df.loc[start_row:, column].apply(grade_two_cutoffs)

    # Build the output name from the stem so the input is never overwritten,
    # even when the name does not contain '.xlsx'.
    root, _ext = os.path.splitext(filename)
    new_filename = root + '_new1.xlsx'
    df.to_excel(new_filename, index=False)
    print(f"替换后的文件已保存为 {new_filename}")
# Example usage: relabel a single column of the merged workbook.
# 'Sheet1' is the name of the default first sheet.
filename, sheetname = '3 合并(脂肪肝+非脂肪肝)完整数据.xlsx', 'Sheet1'
# Column to relabel, passed as the third argument below.
column_index = 4
replace_column_values(filename, sheetname, column_index)
上面的代码只处理一列数据;下面的版本为每一列定义各自的规则,然后逐列进行替换。
import pandas as pd
def apply_column_rules(df, column_rules, start_row=4):
    """Apply one labelling rule per column to ``df`` in place.

    Args:
        df: DataFrame to modify.
        column_rules: Mapping of integer column position to a one-argument
            callable that returns the replacement for a cell value.
        start_row: First index label to relabel; earlier rows are kept.

    Returns:
        The same DataFrame, for convenience.
    """
    for column_index, rule in column_rules.items():
        column_name = df.columns[column_index]
        print(column_name)
        # Text labels are about to be stored in what may be a numeric column.
        df[column_name] = df[column_name].astype(object)
        df.loc[start_row:, column_name] = df.loc[start_row:, column_name].apply(rule)
    return df


def replace_column_values(filename, sheetname, column_rules, start_row=4):
    """Relabel several columns of an Excel sheet and save a copy.

    Args:
        filename: Path of the workbook to read.
        sheetname: Sheet to read.
        column_rules: Mapping of integer column position to a one-argument
            rule, see :func:`apply_column_rules`.
        start_row: First DataFrame index label to relabel.

    The result is written next to the input as ``<name>_new.xlsx``.
    """
    import os

    # Read the original file.
    df = pd.read_excel(filename, sheet_name=sheetname)

    # The leftover debug ``print(df[2])`` was dropped: it raised KeyError
    # whenever the sheet had no column labelled 2.
    apply_column_rules(df, column_rules, start_row)

    # Build the output name from the stem so the input is never overwritten,
    # even when the name does not contain '.xlsx'.
    root, _ext = os.path.splitext(filename)
    new_filename = root + '_new.xlsx'
    df.to_excel(new_filename, index=False)
    print(f"替换后的文件已保存为 {new_filename}")
# Example usage: relabel every indicator column of the merged workbook.
filename = '3 合并(脂肪肝+非脂肪肝)完整数据.xlsx'  # workbook to read
sheetname = 'Sheet1'  # 'Sheet1' is the name of the default first sheet


def grade_value(x, upper, lower=None, severe=None):
    """Turn a numeric measurement into a categorical label.

    Args:
        x: Cell value. NaN and values that cannot be compared with a
            number are returned unchanged.
        upper: Exclusive upper limit of the normal range.
        lower: Inclusive lower limit of the normal range, or None.
        severe: Values at or above this limit are '过高' instead of
            '偏高'; None disables the '过高' level.

    Returns:
        '偏低', '正常', '偏高' or '过高', or ``x`` itself.
    """
    try:
        if x != x:  # NaN is the only value that differs from itself
            return x
        if lower is not None and x < lower:
            return '偏低'
        if x < upper:
            return '正常'
        if severe is None or x < severe:
            return '偏高'
    except TypeError:
        return x
    return '过高'


# Rule per 0-based column position. Every rule takes exactly one argument,
# because Series.apply passes only the cell value.
# NOTE(review): a per-column rule cannot see the sex column, so positions
# 5, 7, 10 and 13 keep the single set of thresholds that was written here;
# sex-specific ranges need row-wise processing.
column_rules = {
    3: lambda x: grade_value(x, 40, severe=80),
    4: lambda x: grade_value(x, 40, severe=80),
    5: lambda x: grade_value(x, 32, lower=7, severe=64),
    # 3.4-17.1 is the normal range (it used to be labelled '偏高').
    6: lambda x: grade_value(x, 17.1, lower=3.4),
    7: lambda x: grade_value(x, 300, lower=100, severe=600),
    8: lambda x: grade_value(x, 5.2),
    9: lambda x: grade_value(x, 1.7),
    10: lambda x: grade_value(x, 80, severe=90),
    11: lambda x: grade_value(x, 23.9),
    13: lambda x: grade_value(x, 1.55, lower=1.29, severe=1.89),
}

if __name__ == '__main__':
    replace_column_values(filename, sheetname, column_rules)
定义好全部规则并完成替换之后,再把结果存储为 csv 文件,用下面的代码就可以转换:
import pandas as pd
# Convert the labelled workbook to CSV in one step. index=False leaves the
# DataFrame index out of the CSV.
pd.read_excel('output1.xlsx').to_csv('disease.csv', index=False)
构建的实体: 病人, 年龄,性别, 丙氨酸氨基转移酶, 天门冬氨酸氨基转移酶, γ-谷氨酰基转移酶, 总胆红素, 尿酸, 总胆固醇, 甘油三酯, 腰围, 体重指数,高密度脂蛋白胆固醇, 糖尿病, 脂肪肝
// Sample graph for a single patient. Every statement ends with ';'
// because a MATCH may not directly follow a CREATE inside one statement
// without an intervening WITH.

// The patient node with its demographic and body measurements.
CREATE (:Patient {id: 1, gender: 'Male', age: 25, waist: 80, bmi: 23, diabetes: false, color: 'blue'});

// One node per biochemical indicator.
CREATE (:BiochemicalIndicator {name: 'ALT', color: 'green'});
CREATE (:BiochemicalIndicator {name: 'AST', color: 'green'});
CREATE (:BiochemicalIndicator {name: 'GGT', color: 'green'});
CREATE (:BiochemicalIndicator {name: 'TBIL', color: 'green'});
CREATE (:BiochemicalIndicator {name: 'UA', color: 'green'});
CREATE (:BiochemicalIndicator {name: 'TC', color: 'green'});
CREATE (:BiochemicalIndicator {name: 'TG', color: 'green'});
CREATE (:BiochemicalIndicator {name: 'HDL-C', color: 'green'});

// The disease node.
CREATE (:FattyLiver {name: 'Fatty Liver', color: 'red'});

// Link the patient to each indicator; the measured value is stored on
// the relationship.
MATCH (p:Patient {id: 1}), (b:BiochemicalIndicator {name: 'ALT'})
CREATE (p)-[:HAS_INDICATOR {value: 23}]->(b);
MATCH (p:Patient {id: 1}), (b:BiochemicalIndicator {name: 'AST'})
CREATE (p)-[:HAS_INDICATOR {value: 25}]->(b);
MATCH (p:Patient {id: 1}), (b:BiochemicalIndicator {name: 'GGT'})
CREATE (p)-[:HAS_INDICATOR {value: 88}]->(b);
MATCH (p:Patient {id: 1}), (b:BiochemicalIndicator {name: 'TBIL'})
CREATE (p)-[:HAS_INDICATOR {value: 16.7}]->(b);
MATCH (p:Patient {id: 1}), (b:BiochemicalIndicator {name: 'UA'})
CREATE (p)-[:HAS_INDICATOR {value: 457}]->(b);
MATCH (p:Patient {id: 1}), (b:BiochemicalIndicator {name: 'TC'})
CREATE (p)-[:HAS_INDICATOR {value: 4.15}]->(b);
MATCH (p:Patient {id: 1}), (b:BiochemicalIndicator {name: 'TG'})
CREATE (p)-[:HAS_INDICATOR {value: 0.96}]->(b);
MATCH (p:Patient {id: 1}), (b:BiochemicalIndicator {name: 'HDL-C'})
CREATE (p)-[:HAS_INDICATOR {value: 1.81}]->(b);

// Link the patient to the disease node.
MATCH (p:Patient {id: 1}), (f:FattyLiver {name: 'Fatty Liver'})
CREATE (p)-[:HAS_LIVER_CONDITION {value: TRUE}]->(f);
导入 csv 文件。
注意:csv 文件要放在 projects 目录或者社区版的 import 目录里面,LOAD CSV 才能通过 file:/// 读取到。
// Import one patient per CSV row and attach each measurement to a shared
// Attribute node through a HAS_ATTRIBUTE relationship carrying the value.
// NOTE(review): the file is read as tab-separated; confirm it really uses
// tabs, since a comma-separated export would not split into these columns.
// NOTE(review): MERGE rejects null property values, so a row whose
// converted field is null (empty cell, non-numeric text, or a FattyLiver
// value that toBoolean cannot parse) would fail; verify against the data.
LOAD CSV WITH HEADERS FROM 'file:///data_disease.csv' AS row
FIELDTERMINATOR '\t'
WITH toInteger(row.id) AS id,
     toInteger(row.gender) AS gender,
     toInteger(row.age) AS age,
     toFloat(row.ALT) AS ALT,
     toFloat(row.AST) AS AST,
     toFloat(row.GGT) AS GGT,
     toFloat(row.TBIL) AS TBIL,
     toFloat(row.UA) AS UA,
     toFloat(row.TC) AS TC,
     toFloat(row.TG) AS TG,
     toFloat(row.WC) AS WC,
     toFloat(row.BMI) AS BMI,
     toInteger(row.diabetes) AS hasDiabetes,
     toFloat(row.HDLC) AS HDL,
     toBoolean(row.FattyLiver) AS hasFattyLiver
// The patient is matched on id, gender, age, WC and BMI together.
MERGE (p:Patient {id: id, gender: gender, age: age, WC: WC, BMI: BMI})
SET p.hasDiabetes = hasDiabetes
// Each attribute node is merged by name, then linked to the patient.
MERGE (altNode:Attribute {name: 'ALT'})
MERGE (p)-[:HAS_ATTRIBUTE {value: ALT}]->(altNode)
MERGE (astNode:Attribute {name: 'AST'})
MERGE (p)-[:HAS_ATTRIBUTE {value: AST}]->(astNode)
MERGE (ggtNode:Attribute {name: 'GGT'})
MERGE (p)-[:HAS_ATTRIBUTE {value: GGT}]->(ggtNode)
MERGE (tbilNode:Attribute {name: 'TBIL'})
MERGE (p)-[:HAS_ATTRIBUTE {value: TBIL}]->(tbilNode)
MERGE (uaNode:Attribute {name: 'UA'})
MERGE (p)-[:HAS_ATTRIBUTE {value: UA}]->(uaNode)
MERGE (tcNode:Attribute {name: 'TC'})
MERGE (p)-[:HAS_ATTRIBUTE {value: TC}]->(tcNode)
MERGE (tgNode:Attribute {name: 'TG'})
MERGE (p)-[:HAS_ATTRIBUTE {value: TG}]->(tgNode)
MERGE (hdlNode:Attribute {name: 'HDL'})
MERGE (p)-[:HAS_ATTRIBUTE {value: HDL}]->(hdlNode)
MERGE (fattyLiverNode:Attribute {name: 'FattyLiver'})
MERGE (p)-[:HAS_ATTRIBUTE {value: hasFattyLiver}]->(fattyLiverNode)