Decrypting Chaoxing Web Fonts
2025-04-02 14:15:03 # Technical Notes

Right-click the page and open Inspect.

Search the page for content related to the scrambled text in order to locate the file that pulls in the font.

Searching quickly turns up what is clearly a Base64-encoded font file; decoding this data recovers the original font file.

Open the file and look inside.

Found it. Copy out the contents, trim the head and tail, and the data looks like this:

Write a script to decode it. Inside the quotes, paste the Base64 data with the leading data:application/font-ttf;charset=utf-8;base64, declaration removed.

import base64

# The Base64-encoded string
base64_string = "paste the Base64 data here, with the leading data:application/font-ttf;charset=utf-8;base64, declaration removed"
# Decode the Base64 string
decoded_data = base64.b64decode(base64_string)

# Save as a .ttf file
with open("chaoxing_font.ttf", "wb") as f:
    f.write(decoded_data)
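
If you would rather not trim the prefix by hand, a minimal sketch can split it off (the data_uri value here is a hypothetical, truncated placeholder):

# Hypothetical example value; in practice paste the full data: URI you copied
data_uri = "data:application/font-ttf;charset=utf-8;base64,AAEAAAsAgAADADA..."
base64_string = data_uri.split("base64,", 1)[1]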

Decoding yields the .ttf font file.

Inspect the font with an online font viewer: https://www.bejson.com/ui/font/
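
As an alternative to the online viewer, a quick fontTools sketch (assuming the library is installed) can list which code points the font claims to cover:

from fontTools.ttLib import TTFont

font = TTFont("chaoxing_font.ttf")
best_cmap = font["cmap"].getBestCmap()  # {code point: glyph name}
for codepoint, glyph_name in sorted(best_cmap.items())[:10]:
    print(hex(codepoint), glyph_name)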

Next, convert the .ttf file to an XML file (this requires the fontTools package for Python).

from fontTools.ttLib import TTFont

# Path to the TTF file
ttf_path = r"D:\UserData\Desktop\chaoxing_font.ttf"
xml_output_path = r"D:\UserData\Desktop\chaoxing_font.xml"
# Load the font file
font = TTFont(ttf_path)
# Save as an XML file
font.saveXML(xml_output_path)
print("Conversion finished")
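
Incidentally, fontTools also ships a command-line tool, ttx, that does the same conversion; if it is on your PATH, something like ttx -o chaoxing_font.xml chaoxing_font.ttf should produce an equivalent XML dump (by default ttx writes a .ttx file).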

Spot-check a few glyphs to verify the mapping (Chaoxing's obfuscation modifies this font's glyph outline data, so the encoded characters render as the unencoded ones).

Download the original font file (i.e. the one before Chaoxing's obfuscation).

(Screenshot: the glyph entry in the source font)

(Screenshot: the corresponding glyph in the Chaoxing-obfuscated font)

In other words, the original uni5148 corresponds to uni57C3 in the Chaoxing font.
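
Glyph names of the form uniXXXX encode a hexadecimal Unicode code point, so the pair above can be sanity-checked directly:

# uniXXXX glyph names encode hex Unicode code points
print(chr(int("5148", 16)))  # -> 先 (the character the glyph actually draws)
print(chr(int("57C3", 16)))  # -> 埃 (the code point Chaoxing serves in the page source)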

Write some comparison code to test this:

import xml.etree.ElementTree as ET
import hashlib
import json


def parse_glyphs(file_path):
    """
    Parse the TTGlyph data from the font XML
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    glyphs = {}

    for glyph in root.findall(".//TTGlyph"):
        name = glyph.get("name")
        points = []
        for pt in glyph.findall(".//pt"):
            x = pt.get("x")
            y = pt.get("y")
            on = pt.get("on")
            points.append(f"{x}{y}{on}")

        # Hash the outline to get a unique fingerprint
        hash_value = hashlib.md5("".join(points).encode('utf-8')).hexdigest()

        # Take characters 25-32 of the hash as the identifier
        truncated_hash = hash_value[24:32]

        glyphs[truncated_hash] = name  # the truncated hash is the key

    return glyphs


def get_unicode_character(name):
    """
    Get the character for a glyph name such as uni5148
    """
    if name.startswith("uni"):
        try:
            unicode_value = int(name[3:], 16)
            return chr(unicode_value)
        except ValueError:
            return None
    return None


def build_mapping(xml_old_path, xml_cx_path):
    """
    Build the mapping between Source Han Sans and the Chaoxing font
    """
    old_glyphs = parse_glyphs(xml_old_path)
    print(len(old_glyphs))
    cx_glyphs = parse_glyphs(xml_cx_path)
    print(len(cx_glyphs))

    mapping = []

    for cx_hash, cx_name in cx_glyphs.items():
        if cx_hash in old_glyphs:
            old_name = old_glyphs[cx_hash]
            character = get_unicode_character(old_name)
            if character:  # make sure it is a valid character
                mapping.append({
                    "chaoxing": cx_name,
                    "si_yuan": {
                        "siyuan_name": old_name,
                        "siyuan_name_value": character
                    }
                })

    return mapping


if __name__ == "__main__":
    xml_old_path = r"D:\UserData\Desktop\思源黑体.xml"
    xml_cx_path = r"D:\UserData\Desktop\chaoxing_font.xml"

    result = build_mapping(xml_old_path, xml_cx_path)

    # Write to file
    with open("glyph_mapping.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    # Print a sample of the results
    # print(json.dumps(result[:5], ensure_ascii=False, indent=4))

The generated result:

[
    {
        "chaoxing": "uni57C2",
        "si_yuan": {
            "siyuan_name": "uni2FAF",
            "siyuan_name_value": "⾯"
        }
    },
    {
        "chaoxing": "uni57E0",
        "si_yuan": {
            "siyuan_name": "uni5584",
            "siyuan_name_value": "善"
        }
    },
    {
        "chaoxing": "uni580F",
        "si_yuan": {
            "siyuan_name": "uni4E16",
            "siyuan_name_value": "世"
        }
    },
    {
        "chaoxing": "uni581D",
        "si_yuan": {
            "siyuan_name": "uni5BB3",
            "siyuan_name_value": "害"
        }
    },
    {
        "chaoxing": "uni900B",
        "si_yuan": {
            "siyuan_name": "uni2F83",
            "siyuan_name_value": "⾃"
        }
    }
]

The test string I used:

Chaoxing: 下埂关于“好好埃生”的埄埆哪埇不埁准埅?

Source Han: 下面关于“好好先生”的描述哪项不太准确?

Checked against the mapping table, the glyph outline data does not line up. Inspecting the font data for the character 下 confirms that the two sides disagree: Chaoxing has altered the glyph outlines themselves, so a simple hash comparison of the outline points is no longer enough.

(Screenshots comparing the glyph outlines of 下: the original font on the left, the Xuexitong font on the right.)
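
To eyeball this yourself, a short sketch (assuming the two XML dumps produced earlier) can print the outline points of 下 (uni4E0B) from each file:

import xml.etree.ElementTree as ET

def dump_points(xml_path, glyph_name="uni4E0B"):  # uni4E0B is 下
    root = ET.parse(xml_path).getroot()
    for glyph in root.findall(".//TTGlyph"):
        if glyph.get("name") == glyph_name:
            return [(pt.get("x"), pt.get("y")) for pt in glyph.findall(".//pt")]
    return []

print(dump_points(r"D:\UserData\Desktop\思源黑体.xml"))
print(dump_points(r"D:\UserData\Desktop\chaoxing_font.xml"))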

A Baidu search turned up the article 从学习通复制文字乱码看前端版权保护 by “I Am I”, which suggests an approach: assume each character's bounding box is unique, and concatenate the boundary values into a key. That gives the following code.

import xml.etree.ElementTree as ET
import hashlib
import json


def parse_glyphs(file_path):
    """
    Parse the TTGlyph data, using xMin, yMin, xMax, yMax as the unique identifier
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    glyphs = {}

    for glyph in root.findall(".//TTGlyph"):
        name = glyph.get("name")

        # Read xMin, yMin, xMax, yMax
        xMin = glyph.get("xMin")
        yMin = glyph.get("yMin")
        xMax = glyph.get("xMax")
        yMax = glyph.get("yMax")

        # Concatenate the four values into a unique key
        if xMin and yMin and xMax and yMax:
            unique_key = f"{xMin}{yMin}{xMax}{yMax}"
            glyphs[unique_key] = name  # the four boundary values as key, glyph name as value

    return glyphs
# def parse_glyphs(file_path):
#     """
#     Parse the TTGlyph data from the font XML
#     """
#     tree = ET.parse(file_path)
#     root = tree.getroot()
#
#     glyphs = {}
#
#     for glyph in root.findall(".//TTGlyph"):
#         name = glyph.get("name")
#         points = []
#         for pt in glyph.findall(".//pt"):
#             x = pt.get("x")
#             y = pt.get("y")
#             on = pt.get("on")
#             points.append(f"{x}{y}{on}")
#
#         # Hash the outline to get a unique fingerprint
#         hash_value = hashlib.md5("".join(points).encode('utf-8')).hexdigest()
#         glyphs[hash_value] = name  # hash maps to glyph name
#
#     return glyphs


def get_unicode_character(name):
    """
    Get the character for a glyph name such as uni5148
    """
    if name.startswith("uni"):
        try:
            unicode_value = int(name[3:], 16)
            return chr(unicode_value)
        except ValueError:
            return None
    return None


def build_mapping(xml_old_path, xml_cx_path):
    """
    Build the mapping between Source Han Sans and the Chaoxing font
    """
    old_glyphs = parse_glyphs(xml_old_path)
    # print(len(old_glyphs))
    cx_glyphs = parse_glyphs(xml_cx_path)
    # print(len(cx_glyphs))
    # print(cx_glyphs)
    mapping = []

    for cx_hash, cx_name in cx_glyphs.items():

        if cx_hash in old_glyphs:
            old_name = old_glyphs[cx_hash]
            character = get_unicode_character(old_name)
            if cx_name == 'uni5814':  # debug output for one specific glyph
                print(cx_hash)
                print(old_name)

            if character:  # make sure it is a valid character
                mapping.append({
                    "chaoxing": cx_name,
                    "si_yuan": {
                        "siyuan_name": old_name,
                        "siyuan_name_value": character
                    }
                })

    return mapping


if __name__ == "__main__":
    xml_old_path = r"D:\UserData\Desktop\思源黑体.xml"
    xml_cx_path = r"D:\UserData\Desktop\chaoxing_font.xml"

    result = build_mapping(xml_old_path, xml_cx_path)

    # Write to file
    with open("glyph_mapping.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    # Print a sample of the results
    # print(json.dumps(result[:5], ensure_ascii=False, indent=4))

Then look up data using the matching results:

import json

# Load the JSON mapping
def load_mapping(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Get the mapping entry for a character's uni name
def get_uni_name(character, mapping):
    unicode_name = f"uni{ord(character):X}"
    # print(unicode_name)
    for entry in mapping:
        if entry["chaoxing"] == unicode_name:
            return entry
    return None

# Decode a string
def parse_code(code, mapping):
    result = []
    for char in code:
        mapping_entry = get_uni_name(char, mapping)
        if mapping_entry:
            result.append({
                "char": char,
                "message": mapping_entry["si_yuan"]['siyuan_name_value']
            })
        else:
            result.append({
                "char": char,
                "message": char
            })
    return result

# Test code
if __name__ == "__main__":
    # Load the glyph mapping
    glyph_mapping_file = "glyph_mapping.json"
    mapping = load_mapping(glyph_mapping_file)
    # Sample string
    code = '下埂关于“好好埃生”的埄埆哪埇不埁准埅?'
    # Decode the string
    parsed_result = parse_code(code, mapping)
    # Print the decoded items
    # for item in parsed_result:
    #     print(item)
    print(f'Chaoxing font: {code}')
    siyuan_font = ''.join([item['message'] for item in parsed_result])
    print(f'Source Han font: {siyuan_font}')

The result:

Chaoxing font: 下埂关于“好好埃生”的埄埆哪埇不埁准埅?
Source Han font: 下⾯关于“好好先生”的描述哪项不太准确?

In that author's tests, this approach pins down roughly 90% of the characters. If you want to stop reading here, you can; it already covers nearly every case.

Then, since my manager recently gave me a task that involved comparing the similarity of two strings, it occurred to me to compute glyph similarity from the x/y point vectors. That led to the code below; first, a test on the data for the character 下.

  1. Normalization: scale every point set to the same range. (Without this, the two outlines sit at different coordinate scales and the comparison is unreliable; the short sketch after this list reproduces the example numbers.)

    Normalizing a point set means transforming every point's coordinates into a fixed standard range, removing differences that come purely from coordinate range, so that comparisons are fair and consistent. Concretely, in this code the goal is to scale every coordinate into the range [0, 1].

    Why normalize?

    When computing the similarity of point sets (for example with dynamic time warping, DTW), different point sets may have different coordinate ranges or units. Without normalization, large coordinate differences can heavily bias the computed similarity. Normalization removes this effect and puts both point sets on the same scale, so their similarity can be compared fairly.

    For example, given the point set:

    points = [(10, 20), (30, 40), (50, 60), (70, 80)]

    after normalization:

    • minimums: min_x = 10, min_y = 20
    • maximums: max_x = 70, max_y = 80

    each point becomes:

    • (10, 20) becomes (0, 0)
    • (30, 40) becomes (0.333, 0.333)
    • (50, 60) becomes (0.667, 0.667)
    • (70, 80) becomes (1, 1)

    In the end all points are normalized into [0, 1], on a consistent scale suitable for the similarity computation that follows. The purpose of normalization is to remove scale differences between point sets so that different point sets can be compared on the same footing, instead of letting coordinate differences distort the result.

  2. Point alignment with DTW: keep the usual DTW alignment method.

    Here we compute a similarity score for the two point sets, derived from the DTW distance as a value between 0 and 1: 1 means identical, 0 means completely different.

    The function uses fastdtw to compute the DTW distance between the two normalized point sets. DTW is an algorithm for measuring the similarity of two time series, commonly used for sequences of unequal length or uneven speed; here it serves equally well to compare two sets of 2-D points.

  3. Similarity computation: compute the similarity from the aligned point sets.
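
A minimal sketch reproducing the example numbers above:

import numpy as np

points = np.array([(10, 20), (30, 40), (50, 60), (70, 80)], dtype=float)
mins, maxs = points.min(axis=0), points.max(axis=0)
print(np.round((points - mins) / (maxs - mins), 3))
# [[0.    0.   ]
#  [0.333 0.333]
#  [0.667 0.667]
#  [1.    1.   ]]

The full test on the glyph 下 follows.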

import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

# Assume we already have the outline data for the two glyphs
ttglyph_superstar = [
    (515, 695), (515, 517), (526, 530), (749, 421), (884, 320),
    (838, 259), (731, 347), (515, 461), (515, -72), (445, -72),
    (445, 695), (59, 695), (59, 762), (942, 762), (942, 695)
]

ttglyph_sourcehan = [
    (515, 695), (515, 517), (526, 530), (618, 485), (720, 426),
    (825, 364), (884, 320), (838, 259), (788, 300), (694, 359),
    (606, 413), (515, 461), (515, -72), (445, -72), (445, 695),
    (59, 695), (59, 762), (942, 762), (942, 695)
]

# Convert to numpy arrays
points1 = np.array(ttglyph_superstar)
points2 = np.array(ttglyph_sourcehan)

def normalize_points(points):
    """
    Normalize a point set
    """
    if len(points) == 0:  # check for an empty point set
        return []

    points = np.array(points)  # convert to a NumPy array
    min_x, min_y = np.min(points, axis=0)
    max_x, max_y = np.max(points, axis=0)

    # Guard against division by zero
    if max_x == min_x:
        max_x = min_x + 1
    if max_y == min_y:
        max_y = min_y + 1

    normalized_points = (points - [min_x, min_y]) / [max_x - min_x, max_y - min_y]
    return normalized_points

def calculate_similarity(points1, points2):
    """
    Compute the similarity of two point sets with DTW
    """
    points1_normalized = normalize_points(points1)
    points2_normalized = normalize_points(points2)

    if len(points1_normalized) == 0 or len(points2_normalized) == 0:
        return 0.0  # if either point set is empty, similarity is 0
    # distance is the total DTW distance, i.e. the overall difference between the point sets.
    # path is the optimal alignment DTW found, describing how points1 maps onto points2.
    distance, path = fastdtw(points1_normalized, points2_normalized, dist=euclidean)
    # DTW produces an alignment path; following it re-orders both point sets so they line up.
    # Use path to pull the aligned points out of points1_normalized and points2_normalized.
    aligned_points1 = [points1_normalized[i] for i, _ in path]
    aligned_points2 = [points2_normalized[j] for _, j in path]
    # Euclidean distance between each aligned pair, i.e. the per-point difference under the
    # best alignment. np.linalg.norm computes the Euclidean distance between two points.
    distances = [np.linalg.norm(np.array(p1) - np.array(p2)) for p1, p2 in zip(aligned_points1, aligned_points2)]
    # Average all the Euclidean distances to get the mean distance
    average_distance = np.mean(distances)

    similarity_score = 1 / (1 + average_distance)
    return similarity_score


print(f"Similarity score: {calculate_similarity(points2, points1)}")

The result:

Similarity score: 0.975700703557036

The similarity is quite high. Note that this approach deliberately ignores font style and stroke details.

Good, this similarity measure can now be used to match up the glyph data behind the Chaoxing font.

import xml.etree.ElementTree as ET
import json
import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from tqdm import tqdm

def parse_glyphs(file_path):
    """
    Parse the TTGlyph data from the font XML
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    glyphs = {}

    for glyph in root.findall(".//TTGlyph"):
        name = glyph.get("name")
        points = []
        for pt in glyph.findall(".//pt"):
            x = int(pt.get("x"))
            y = int(pt.get("y"))
            on = int(pt.get("on", 0))  # default to 0 if the attribute is missing
            points.append((x, y))

        # Serialize the point list as a string and use it as the dict key
        key = str(points)
        glyphs[key] = name

    return glyphs

def get_unicode_character(name):
    """
    Get the character for a glyph name such as uni5148
    """
    if name.startswith("uni"):
        try:
            unicode_value = int(name[3:], 16)
            return chr(unicode_value)
        except ValueError:
            return None
    return None

def normalize_points(points):
    """
    Normalize a point set
    """
    if not points:  # check for an empty point set
        return []

    points = np.array(points)  # convert to a NumPy array
    min_x, min_y = np.min(points, axis=0)
    max_x, max_y = np.max(points, axis=0)

    # Guard against division by zero
    if max_x == min_x:
        max_x = min_x + 1
    if max_y == min_y:
        max_y = min_y + 1

    normalized_points = (points - [min_x, min_y]) / [max_x - min_x, max_y - min_y]
    return normalized_points

def calculate_similarity(points1, points2):
    """
    Compute the similarity of two point sets with DTW
    """
    points1_normalized = normalize_points(points1)
    points2_normalized = normalize_points(points2)

    if len(points1_normalized) == 0 or len(points2_normalized) == 0:
        return 0.0  # if either point set is empty, similarity is 0

    distance, path = fastdtw(points1_normalized, points2_normalized, dist=euclidean)

    aligned_points1 = [points1_normalized[i] for i, _ in path]
    aligned_points2 = [points2_normalized[j] for _, j in path]

    distances = [np.linalg.norm(np.array(p1) - np.array(p2)) for p1, p2 in zip(aligned_points1, aligned_points2)]
    average_distance = np.mean(distances)

    similarity_score = 1 / (1 + average_distance)
    return similarity_score

def build_mapping(xml_old_path, xml_cx_path):
    """
    Build the mapping between Source Han Sans and the Chaoxing font
    """
    old_glyphs = parse_glyphs(xml_old_path)
    print(f'Source Han glyphs: {len(old_glyphs)}')
    cx_glyphs = parse_glyphs(xml_cx_path)
    print(f'Chaoxing glyphs: {len(cx_glyphs)}')

    mapping = []

    total_combinations = len(old_glyphs) * len(cx_glyphs)
    with tqdm(total=total_combinations, desc="Processing") as pbar:
        for old_key, old_name in old_glyphs.items():
            for cx_key, cx_name in cx_glyphs.items():
                # eval() recovers the point list from its string key
                similarity = calculate_similarity(eval(old_key), eval(cx_key))
                if similarity >= 0.9:
                    mapping.append({
                        "chaoxing": {
                            "cx_name": cx_name,
                            "cx_character": get_unicode_character(cx_name)
                        },
                        "si_yuan": {
                            "sy_name": old_name,
                            "sy_character": get_unicode_character(old_name)
                        },
                        "similarity": similarity
                    })
                pbar.update(1)

    return mapping

if __name__ == "__main__":
    xml_old_path = r"D:\UserData\Desktop\思源黑体.xml"
    xml_cx_path = r"D:\UserData\Desktop\chaoxing_font.xml"

    result = build_mapping(xml_old_path, xml_cx_path)

    # Write to file
    with open("glyph_mapping2.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    # print(json.dumps(result[:5], ensure_ascii=False, indent=4))

But the runtime is disappointing.

(Screenshot: the run in progress)

A runtime that long is intolerable, so switch to parallel processing across multiple processes and keep the CPU busy.

from concurrent.futures import ProcessPoolExecutor, as_completed
import json
import numpy as np
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from tqdm import tqdm
import xml.etree.ElementTree as ET

# The other functions are unchanged from the earlier code
def calculate_similarity(points1, points2):
    """
    Compute the similarity of two point sets with DTW
    """
    points1_normalized = normalize_points(points1)
    points2_normalized = normalize_points(points2)

    if len(points1_normalized) == 0 or len(points2_normalized) == 0:
        return 0.0  # if either point set is empty, similarity is 0

    distance, path = fastdtw(points1_normalized, points2_normalized, dist=euclidean)

    aligned_points1 = [points1_normalized[i] for i, _ in path]
    aligned_points2 = [points2_normalized[j] for _, j in path]

    distances = [np.linalg.norm(np.array(p1) - np.array(p2)) for p1, p2 in zip(aligned_points1, aligned_points2)]
    average_distance = np.mean(distances)

    similarity_score = 1 / (1 + average_distance)
    return similarity_score

def normalize_points(points):
    """
    Normalize a point set
    """
    if not points:  # check for an empty point set
        return []

    points = np.array(points)  # convert to a NumPy array
    min_x, min_y = np.min(points, axis=0)
    max_x, max_y = np.max(points, axis=0)

    # Guard against division by zero
    if max_x == min_x:
        max_x = min_x + 1
    if max_y == min_y:
        max_y = min_y + 1

    normalized_points = (points - [min_x, min_y]) / [max_x - min_x, max_y - min_y]
    return normalized_points

def parallel_calculate_similarity(old_key, old_name, cx_glyphs):
    """
    Compute similarities in parallel
    """
    results = []
    for cx_key, cx_name in cx_glyphs.items():
        similarity = calculate_similarity(eval(old_key), eval(cx_key))
        if similarity >= 0.9:
            results.append({
                "chaoxing": {
                    "cx_name": cx_name,
                    "cx_character": get_unicode_character(cx_name)
                },
                "si_yuan": {
                    "sy_name": old_name,
                    "sy_character": get_unicode_character(old_name)
                },
                "similarity": similarity
            })
    return results

def get_unicode_character(name):
    """
    Get the character for a glyph name such as uni5148
    """
    if name.startswith("uni"):
        try:
            unicode_value = int(name[3:], 16)
            return chr(unicode_value)
        except ValueError:
            return None
    return None

def parse_glyphs(file_path):
    """
    Parse the TTGlyph data from the font XML
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    glyphs = {}

    for glyph in root.findall(".//TTGlyph"):
        name = glyph.get("name")
        points = []
        for pt in glyph.findall(".//pt"):
            x = int(pt.get("x"))
            y = int(pt.get("y"))
            on = int(pt.get("on", 0))  # default to 0 if the attribute is missing
            points.append((x, y))

        # Serialize the point list as a string and use it as the dict key
        key = str(points)
        glyphs[key] = name

    return glyphs


def build_mapping_parallel(xml_old_path, xml_cx_path):
    """
    Build the Source Han Sans / Chaoxing mapping in parallel
    """
    old_glyphs = parse_glyphs(xml_old_path)
    print(f'Source Han glyphs: {len(old_glyphs)}')
    cx_glyphs = parse_glyphs(xml_cx_path)
    print(f'Chaoxing glyphs: {len(cx_glyphs)}')

    mapping = []

    # Parallelize with a process pool
    with ProcessPoolExecutor() as executor:
        futures = []
        # Submit one task per Source Han glyph
        for old_key, old_name in old_glyphs.items():
            futures.append(executor.submit(parallel_calculate_similarity, old_key, old_name, cx_glyphs))

        # Collect results via as_completed
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            mapping.extend(future.result())

    return mapping

if __name__ == "__main__":
    xml_old_path = r"D:\UserData\Desktop\思源黑体.xml"
    xml_cx_path = r"D:\UserData\Desktop\chaoxing_font.xml"

    result = build_mapping_parallel(xml_old_path, xml_cx_path)

    # Write to file
    with open("glyph_mapping_parallel.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    # Print a sample of the results
    print(json.dumps(result[:5], ensure_ascii=False, indent=4))
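
A further speedup worth considering (my own idea, not from the referenced article): most glyph pairs are obviously different, so a cheap pre-filter that skips DTW when the outlines' point counts differ wildly would cut the number of expensive comparisons. A minimal sketch, assuming it is called before calculate_similarity in the inner loop:

def quick_reject(points1, points2, max_ratio=1.5):
    # Outlines whose point counts differ by more than max_ratio are unlikely
    # to score >= 0.9, so skip the expensive DTW call for those pairs.
    n1, n2 = len(points1), len(points2)
    if n1 == 0 or n2 == 0:
        return True
    return max(n1, n2) / min(n1, n2) > max_ratio

The 1.5 ratio is a guess and would need tuning against the actual data.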

This brings the processing time to about half an hour (with the CPU pegged). Because I kept every pair scoring above 0.9, there are lots of duplicate glyph matches, so the next step is to extract, for each glyph, the single match with the highest similarity.

import json

# Read the saved results and generate high.json containing the highest-similarity entries
def find_most_similar_for_all(result_file="glyph_mapping_parallel.json", output_file="high.json"):
    # Load the JSON data
    with open(result_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Stores, for each chaoxing glyph, its most similar si_yuan entry
    highest_similarity_entries = {}

    # Walk all entries and keep the most similar si_yuan match for each chaoxing character
    for entry in data:
        cx_name = entry["chaoxing"]["cx_name"]
        similarity = entry["similarity"]

        # If this cx_name is new, or the current similarity is higher, keep this entry
        if cx_name not in highest_similarity_entries or similarity > highest_similarity_entries[cx_name]["similarity"]:
            highest_similarity_entries[cx_name] = entry

    # print(len(highest_similarity_entries))
    # Save the result to high.json
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(list(highest_similarity_entries.values()), f, ensure_ascii=False, indent=4)

    print(f"Results saved to {output_file}")

# Run it to generate high.json
find_most_similar_for_all()

At this point the mapping table is completely finished. Now run some data through it as a test.

import json


# Read high.json and load the data
def load_high_json(file_path="high.json"):
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


# Match each character of the string against high.json and return the decoded string
def match_string_with_high_json(code, high_json_data):
    result = []

    for char in code:
        # Walk all entries in high.json looking for a matching cx_character
        matched = False
        for entry in high_json_data:
            if entry["chaoxing"]["cx_character"] == char:
                # Append the match to the output
                result.append(entry["si_yuan"]["sy_character"])  # use the si_yuan character
                matched = True
                break

        if not matched:
            # No match found; keep the original character
            result.append(char)

    # Join the matched characters into a single string
    return ''.join(result)


# Sample string
code = '下埂关于“好好埃生”的埄埆哪埇不埁准埅?'

# Load the high.json data
high_json_data = load_high_json()

# Decode the string
result_string = match_string_with_high_json(code, high_json_data)
print(f'Chaoxing font: {code}')
print(f'Source Han font: {result_string}')

The result:

Chaoxing font: 下埂关于“好好埃生”的埄埆哪埇不埁准埅?
Source Han font: 下⾯关于“好好先生”的描述哪项不太准确?

Good, it works. One lingering question about the Chaoxing fonts: why does each page load only its own font, so that you can never obtain the complete set? I don't know how to get around this, and it puzzles me; I'd be grateful if someone more knowledgeable could explain.

And that concludes this article.

References:

关于超星学习通网页版字体加密分析: https://www.52pojie.cn/thread-1631357-4-1.html

从学习通复制文字乱码看前端版权保护: https://5ime.cn/xxt_font.html