SensitiveInformationDetection：敏感信息检测工具（暴露面检测）——sensitive information types and pattern detection

作者：程序质量控制师 | 2024-01-29 22:12:34

踩

sensitive information types and pattern detection, or regex patterns.

前言

前段时间在工作中碰到了一些需要大量人工检测的工作——暴露面检测，其最后一步要人工检测该页面是否存在敏感信息泄露问题，需要人工判断，遂写了一个自动检测网页敏感信息的脚本。

介绍

该脚本主要从中间件版本、其他版本、是否有源码泄露、敏感信息检测和是否存在下载行为这几个方面进行检测。

由于禁用了SSL证书验证，所以会存在一些安全性的问题，请酌情使用。

原理为爬取网页内容，使用正则表达式匹配关键词，如果有特殊关键字可以自行修改、添加正则表达式来完善成自己所想使用的代码。

中间件版本

主要从以下版本中检测（基本涵盖了市面上常见的中间件）

Tomcat
WebLogic
Jboss
Jetty
WebSphere
Glassfish
Nginx
Apache
Microsoft IIS
Kafka
RabbitMQ
Redis
Elasticsearch
MongoDB
MySQL
Node.js
Express.js
Django

其他版本

主要是对【数字.数字】【数字.数字.数字】这两种格式的版本进行检测

是否有源码泄露

因为网页大部分是由HTML书写，所以在匹配时先匹配HTML并将其剔除，再匹配其他语言（包括下面列出的这些），主要是通过各语言中常见的语法关键字进行匹配。

HTML

Python
JavaScript
Java
C++
Go

敏感信息检测

主要检测以.com 和.cn结尾的邮箱；以13、14、15、18和17开头的11位中国大陆手机号；以及中国大陆身份证号

是否存在下载行为

对目标URL发送HEAD请求并检查响应头：如果Content-Type以httpd/unix-directory或者application开头，即可能存在下载行为；如果遇到特殊的情况也可以自行添加。

使用

在脚本中我引入了argparse模块，-h查看使用方法

在这里插入图片描述

目前只支持单条URL检测和文件批量检测两种方式，也可以将扫描结果输出到文件；批量检测时使用多线程以更快地处理多条数据（线程数默认为5）。

源代码

import argparse
import re
import requests
import threading
from tabulate import tabulate
import urllib3
from tqdm import tqdm

# Silence urllib3's InsecureRequestWarning. This only hides the warning; certificate
# verification itself is switched off per request via verify=False further down.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Middleware fingerprints: display name -> regex source.
# Most patterns capture the version string that follows "<product>/" in group 1.
REGEX_DICT = {
    'Tomcat': r'Apache\s*Tomcat/([\d\.]+)',
    'Weblogic': r'Oracle\s*WebLogic\s*Server/([\d\.]+)',
    'Jboss': r'JBoss/([\d\.]+)',
    'Jetty': r'Jetty/([\d\.]+)',
    'Webshere': r'IBM\s*WebSphere/([\d\.]+)',
    'Glassfish': r'GlassFish/([\d\.]+)',
    'Nginx': r'nginx/([\d\.]+)',
    'Apache': r'Apache/([\d\.]+)',
    'Microsoft IIS': r'Microsoft-IIS/([\d\.]+)',
    'Kafka': r'Apache\s*Kafka/([\d\.]+)',
    'RabbitMQ': r'RabbitMQ/([\d\.]+)',
    'Redis': r'Redis/([\d\.]+)',
    'Elasticsearch': r'Elasticsearch/([\d\.]+)',
    'MongoDB': r'MongoDB/([\d\.]+)',
    'MySQL': r'MySQL/([\d\.]+)',
    # NOTE(review): the three X-Powered-By patterns below have no capture group, so
    # findall() yields the whole matched text rather than a version number.
    # 'Node.js' and 'Express.js' use the identical pattern -- confirm this is intended.
    'Node.js': r'X-Powered-By: Express',
    'Express.js': r'X-Powered-By: Express',
    'Django': r'X-Powered-By: Django'
}
# Compile every fingerprint once, case-insensitively, keyed by the same display name.
COMPILED_REGEX_DICT = {middleware: re.compile(regex, re.IGNORECASE) for middleware, regex in REGEX_DICT.items()}

# Patterns for personal data that should not appear on a public page.
# The digit look-arounds stop the numeric patterns from firing inside a longer
# run of digits (timestamps, order numbers, ...).
SENSITIVE_INFO_REGEX_LIST = [
    # E-mail addresses whose domain ends in .cn or .com (not merely starts with it,
    # e.g. ".company" is rejected).
    r'([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.(cn|com))(?![A-Za-z0-9])',
    # Mainland-China mobile numbers: exactly 11 digits starting with 13/14/15/17/18,
    # optionally preceded by the 86 / +86 country code.
    r'(?<!\d)(?:\+?86)?((13|14|15|17|18)[0-9]{9})(?!\d)',
    # Mainland-China ID numbers: 17 digits plus a check character (digit, X or x),
    # or the legacy 15-digit form.
    r'(?<!\d)(\d{17}[\dXx]|\d{15})(?!\d)',
]

# Source-code fingerprints, tried in insertion order with re.search().
# 'HTML' must stay first: a page recognised as HTML is treated as "no source leak"
# before any of the other (much looser) keyword patterns are consulted.
# Raw strings are used so that "\s" reaches the regex engine without triggering
# Python's invalid-escape-sequence warning.
PROGRAMMING_LANGUAGES = {
    # Case-insensitive, and tolerant of attributes such as <html lang="en">.
    'HTML': r'(?i)<html[\s>]|<!DOCTYPE',
    'Python': r'import\s+|def\s+|print\s*\(|from\s+',
    'JavaScript': r'function\s+|console\.',
    'Java': r'public\s+class\s+|import\s+java\.',
    'C++': r'#include\s+<|using\s+namespace\s+std',
    'Go': r'go\s+'
}

# Fetch the page body
def read_content(url):
    """Download *url* and return the response body as text.

    The request is made without certificate verification and with a
    3-second timeout. Any ``requests`` error (including a 4xx/5xx status,
    via ``raise_for_status``) is printed and the sentinel string ``"NONE"``
    is returned instead of raising.
    """
    try:
        # Silence urllib3 warnings before issuing the unverified request.
        requests.packages.urllib3.disable_warnings()
        resp = requests.get(url, verify=False, timeout=3)
        resp.raise_for_status()
        return resp.text
    except requests.exceptions.RequestException as err:
        print(f"请求异常: {err}")
        return "NONE"

# Generic "N.N" / "N.N.N" version number. The look-arounds reject fragments of a
# longer dotted number (IP address, 4-part version) while still allowing a
# sentence-ending period right after the version.
_OTHER_VERSION_RE = re.compile(r'(?<![\d.])\d+(?:\.\d+){1,2}(?!\.?\d)')


# Find middleware and other versions
def find_versions(content, compiled_regex_dict):
    """Look for a middleware fingerprint, falling back to generic versions.

    Args:
        content: Text to scan.
        compiled_regex_dict: Mapping of middleware name -> compiled pattern,
            tried in insertion order.

    Returns:
        A ``(found_middleware, found_other)`` tuple of lists. If any
        middleware pattern matches, ``found_middleware`` holds one entry --
        the name immediately followed by the first ``findall`` result -- and
        ``found_other`` is empty. Otherwise ``found_middleware`` is empty and
        ``found_other`` lists every stand-alone two- or three-part version
        number found in *content*.
    """
    for middleware, compiled_regex in compiled_regex_dict.items():
        matches = compiled_regex.findall(content)
        if matches:
            # The first fingerprint that hits wins; generic versions are skipped.
            return [f"{middleware}{matches[0]}"], []

    return [], _OTHER_VERSION_RE.findall(content)

# Version detection
def version_detection(content, compiled_regex_dict):
    """Return the first middleware hit and the first generic version hit.

    Delegates to ``find_versions`` when *content* is truthy. Each element of
    the returned 2-tuple is the first entry of the corresponding result list,
    or the string ``"NONE"`` when that list is empty (or *content* is falsy).
    """
    if not content:
        return "NONE", "NONE"

    middleware_hits, other_hits = find_versions(content, compiled_regex_dict)
    first_middleware = middleware_hits[0] if middleware_hits else "NONE"
    first_other = other_hits[0] if other_hits else "NONE"
    return first_middleware, first_other

# Match programming languages in the page content
def match_programming_language(content):
    """Name the first language in PROGRAMMING_LANGUAGES whose pattern occurs.

    Patterns are tried in the dictionary's iteration order and the search
    stops at the first hit. A hit on ``"HTML"`` counts as an ordinary web
    page, so ``"NONE"`` is returned for it, as it is when nothing matches.
    """
    first_hit = next(
        (name for name, pattern in PROGRAMMING_LANGUAGES.items()
         if re.search(pattern, content)),
        None,
    )
    if first_hit is None or first_hit == "HTML":
        return "NONE"
    return first_hit

# Detect sensitive information
def check_sensitive_info(content):
    """Report whether any pattern in SENSITIVE_INFO_REGEX_LIST hits *content*.

    Returns ``"Possible"`` as soon as one pattern produces a non-empty
    ``findall`` result item, otherwise ``"NONE"``. The matched text itself is
    not returned.
    """
    for pattern in SENSITIVE_INFO_REGEX_LIST:
        for hit in re.findall(pattern, content):
            # A hit is a string or a tuple of groups; either way it must be non-empty.
            if len(hit) > 0:
                return "Possible"
    return "NONE"

# Decide whether the URL looks like a file download
def is_downloadable(url, timeout=3):
    """Guess from the response's Content-Type whether *url* serves a download.

    A HEAD request is sent (following redirects, without certificate
    verification).

    Args:
        url: Address to probe.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the caller indefinitely.

    Returns:
        ``"Possible"`` when the Content-Type starts with ``application`` or
        ``httpd/unix-directory`` (compared case-insensitively), otherwise
        ``"NONE"`` -- including when the request fails or the header is absent.
    """
    try:
        response = requests.head(url, allow_redirects=True, verify=False, timeout=timeout)
    except requests.exceptions.RequestException:
        return "NONE"

    # Media types are case-insensitive, so normalise before comparing.
    content_type = (response.headers.get('content-type') or '').strip().lower()
    if content_type.startswith(('application', 'httpd/unix-directory')):
        return "Possible"
    return "NONE"

# Output the results
def output_results(output, output_file=None):
    """Print the result rows as a table and optionally save the same table.

    Args:
        output: List of rows, each with six cells in the order of the
            headers defined below.
        output_file: Optional path; when given, the rendered table is also
            written there as UTF-8 text.

    The table is printed before the file is opened, so the results are still
    shown when the output path turns out to be unwritable.
    """
    headers = ["URL", "Middleware version", "Other version", "Source code leakage", "information leakage", "Download files?"]
    table = tabulate(output, headers, tablefmt='simple')
    print(table)
    if output_file:
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            file.write(table)
# Thread entry point
def worker(url, COMPILED_REGEX_DICT):
    """Run every check against *url* and append one result row.

    Relies on two module-level names: ``semaphore``, which is held for the
    duration of the scan and released even when a check raises, and
    ``output``, the list that receives the row
    ``[url, middleware, other, language, sensitive, downloadable]``.
    """
    with semaphore:  # blocks while the maximum number of scans is already running
        page = read_content(url)
        middleware_version, other_version = version_detection(page, COMPILED_REGEX_DICT)
        leaked_language = match_programming_language(page)
        sensitive_flag = check_sensitive_info(page)
        download_flag = is_downloadable(url)
        output.append(
            [url, middleware_version, other_version, leaked_language, sensitive_flag, download_flag]
        )


if __name__ == '__main__':
    # Command-line entry point: scan a single URL (-u) or every URL listed in a file (-f).
    parser = argparse.ArgumentParser(description='敏感信息检测')
    parser.add_argument('-u', '--url', type=str, help='URL')
    parser.add_argument('-f', '--file', type=str, help='输入文件')
    parser.add_argument('-o', '--output', type=str, help='输出文件')
    parser.add_argument('-t', '--thread', type=int, help='线程数量(默认为5)')
    args = parser.parse_args()

    # Module-level state shared with worker(): the result rows and the limiter
    # on how many scans may run at once.
    output = []
    # Fall back to 5 for a missing, zero or negative value; BoundedSemaphore
    # rejects negative counts.
    thread = args.thread if args.thread and args.thread > 0 else 5
    semaphore = threading.BoundedSemaphore(thread)

    if args.url:
        # A single URL goes through the same code path as a batch entry.
        worker(args.url, COMPILED_REGEX_DICT)
        output_results(output, args.output)
    elif args.file:
        with open(args.file, 'r') as f:
            # Skip blank lines so they are not scanned as (invalid) URLs.
            urls = [line.strip() for line in f if line.strip()]

        # One thread per URL; the semaphore inside worker() caps how many run concurrently.
        threads = [
            threading.Thread(target=worker, args=(url, COMPILED_REGEX_DICT))
            for url in urls
        ]
        for t in threads:
            t.start()

        # Advance the bar as threads finish, not as they are started, so it
        # reflects completed work.
        with tqdm(total=len(threads)) as pbar:
            for t in threads:
                t.join()
                pbar.update(1)
        output_results(output, args.output)
    else:
        # Neither -u nor -f was given: show usage instead of exiting silently.
        parser.print_help()
测试

在这里插入图片描述

测试案例中（192.168.164.134:8080是我自己搭建的web服务，使用的是Tomcat8.5.19）

第一个是不知名版本泄露

第二和第三个都是Tomcat版本泄露

第四个不知名版本泄露和JavaScript源码泄露

第五个是可能存在信息泄露，信息如下所示，包含了邮箱

123456

sjidfnj@qq.com

第六、七、八个可能存在下载行为：其响应包的Content-Type以httpd/unix-directory或者application开头

免责声明

本工具仅能在取得足够合法授权的企业安全建设中使用，在使用本工具过程中，您应确保自己所有行为符合当地的法律法规。如您在使用本工具的过程中存在任何非法行为，您将自行承担所有后果，本工具所有开发者和所有贡献者不承担任何法律及连带责任。除非您已充分阅读、完全理解并接受本协议所有条款，否则，请您不要使用本工具。您的使用行为或者您以其他任何明示或者默示方式表示接受本协议的，即视为您已阅读并同意本协议的约束。

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/article/detail/45394