/src/osquery/osquery/tables/system/hash.cpp
/**
 * Copyright (c) 2014-present, The osquery authors
 *
 * This source code is licensed as defined by the LICENSE file found in the
 * root directory of this source tree.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR GPL-2.0-only)
 */

// clang-format off
#include <sys/types.h>
#include <sys/stat.h>
// clang-format on

#ifndef WIN32
#include <unistd.h>
#endif

#include <algorithm>
#include <set>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

#include <boost/filesystem.hpp>

#include <osquery/core/flags.h>
#include <osquery/filesystem/filesystem.h>
#include <osquery/hashing/hashing.h>
#include <osquery/logger/logger.h>
#include <osquery/core/tables.h>
#include <osquery/sql/dynamic_table_row.h>
#include <osquery/utils/mutex.h>
#include <osquery/utils/info/platform_type.h>
#include <osquery/worker/ipc/platform_table_container_ipc.h>
#include <osquery/worker/logging/glog/glog_logger.h>
#include <osquery/worker/logging/logger.h>

namespace osquery {

FLAG(bool,
     disable_hash_cache,
     false,
     "Disable caching of calculated file hashes; when caching is enabled, "
     "hashes are re-calculated only if inode times change");

FLAG(uint32, hash_cache_max, 500, "Size of LRU file hash cache");

HIDDEN_FLAG(uint32,
            hash_delay,
            20,
            "Number of milliseconds to delay after hashing");
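
// Illustrative command lines exercising the flags above; the binary names are
// real osquery entry points, but the values are examples only:
//
//   osqueryd --hash_cache_max=1000 --hash_delay=50
//   osqueryi --disable_hash_cache "SELECT sha256 FROM hash WHERE path = '/etc/hosts';"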

DECLARE_uint64(read_max);

namespace tables {

/// Evict this many rows each time cache eviction is triggered.
const size_t kHashCacheEvictSize{5};

/**
 * @brief Implements persistent in-memory caching of files' hashes.
 *
 * This cache has an LRU eviction policy. The hash is recalculated
 * every time the mtime or size of the file changes.
 */
struct FileHashCache {
  /// The file's modification time; changes with a touch.
  time_t file_mtime;

  /// The file's serial or information number (inode).
  ino_t file_inode;

  /// The file's size.
  off_t file_size;

  /// For eviction: the last time this cache item was used.
  time_t cache_access_time;

  /// Cache content: the hashes.
  MultiHashes hashes;

  /// Cache index: the file path.
  std::string path;

  /// Comparison function for organizing the LRU heap.
  static bool greater(const FileHashCache* l, const FileHashCache* r) {
    return l->cache_access_time > r->cache_access_time;
  }

  /**
   * @brief Do-it-all access function.
   *
   * Maintains the cache of hash sums: stats the file at path, and if it has
   * changed or is not present in the cache, calculates the hashes and caches
   * the result.
   *
   * @param path the path of the file to hash.
   * @param out stores the calculated hashes.
   * @param logger used to report failures (e.g. when stat fails).
   *
   * @return true if it succeeded, false if something went wrong.
   */
  static bool load(const std::string& path, MultiHashes& out, Logger& logger);
};
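
// A minimal usage sketch for the cache above (GLOGLogger comes from the
// worker logging headers included at the top of this file; the path is an
// arbitrary example):
//
//   GLOGLogger logger;
//   MultiHashes mh;
//   if (FileHashCache::load("/etc/hosts", mh, logger)) {
//     // mh.md5 / mh.sha1 / mh.sha256 now hold the cached or fresh digests.
//   }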

#if defined(WIN32)

#define stat _stat
#define strerror_r(e, buf, sz) strerror_s((buf), (sz), (e))

#endif

/**
 * @brief Checks the current stat output against the cached view.
 *
 * If the modification time, inode, or size of the file has changed, the hash
 * should be recalculated.
 */
static inline bool statInvalid(const struct stat& st, const FileHashCache& fh) {
  if (st.st_ino != fh.file_inode || st.st_mtime != fh.file_mtime) {
    // Most plausible case for modification detection.
    return true;
  }

  if (st.st_size != fh.file_size) {
    // Just in case there's tomfoolery.
    return true;
  }
  return false;
}
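
// For instance (a hypothetical timeline): hashing /etc/hosts caches its
// mtime, inode, and size; a later `touch /etc/hosts` bumps st_mtime,
// statInvalid() returns true, and load() recomputes the digests on the next
// query.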

bool FileHashCache::load(const std::string& path,
                         MultiHashes& out,
                         Logger& logger) {
  // Synchronizes access to the cache.
  static Mutex mx;
  // path => cache entry
  static std::unordered_map<std::string, FileHashCache> cache;
  // Min-heap on cache_access_time: lru[0] is the least recently used entry.
  static std::vector<FileHashCache*> lru;

  WriteLock guard(mx);

  struct stat st;
  if (stat(path.c_str(), &st) != 0) {
    char buf[0x200] = {0};
    strerror_r(errno, buf, sizeof(buf));
    logger.log(google::GLOG_WARNING, "Cannot stat file: " + path + ": " + buf);
    return false;
  }

  auto entry = cache.find(path);
  if (entry == cache.end()) { // none, load
    if (cache.size() >= FLAGS_hash_cache_max) {
      // Cache too large: evict the least recently used entries.
      for (size_t i = 0; i < kHashCacheEvictSize; ++i) {
        if (lru.empty()) {
          // Nothing left to evict; looping further would be pointless.
          break;
        }
        std::string key = lru[0]->path;
        std::pop_heap(lru.begin(), lru.end(), FileHashCache::greater);
        lru.pop_back();
        // erase() is a no-op if the key is already gone.
        cache.erase(key);
      }
    }

    auto hashes = hashMultiFromFile(
        HASH_TYPE_MD5 | HASH_TYPE_SHA1 | HASH_TYPE_SHA256, path);
    FileHashCache rec = {st.st_mtime, // .file_mtime
                         st.st_ino, // .file_inode
                         st.st_size, // .file_size
                         time(nullptr), // .cache_access_time
                         std::move(hashes), // .hashes
                         path}; // .path
    cache[path] = std::move(rec);
    // Pointers into an unordered_map remain valid across rehashes, so the
    // LRU heap can safely hold raw pointers to cache entries.
    lru.push_back(&cache[path]);
    std::push_heap(lru.begin(), lru.end(), FileHashCache::greater);
    out = cache[path].hashes;
  } else if (statInvalid(st, entry->second)) { // changed, update
    auto hashes = hashMultiFromFile(
        HASH_TYPE_MD5 | HASH_TYPE_SHA1 | HASH_TYPE_SHA256, path);
    entry->second.cache_access_time = time(nullptr);
    entry->second.file_inode = st.st_ino;
    entry->second.file_mtime = st.st_mtime;
    entry->second.file_size = st.st_size;
    entry->second.hashes = std::move(hashes);
    std::make_heap(lru.begin(), lru.end(), FileHashCache::greater);
    out = entry->second.hashes;
  } else { // ok, got it
    out = entry->second.hashes;
    entry->second.cache_access_time = time(nullptr);
    std::make_heap(lru.begin(), lru.end(), FileHashCache::greater);
  }
  return true;
}
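
// A minimal standalone sketch of the heap discipline used above, with plain
// integers standing in for cache_access_time values (hypothetical, for
// illustration only):
//
//   std::vector<int> times = {30, 10, 20};
//   auto older = [](int l, int r) { return l > r; }; // mirrors greater()
//   std::make_heap(times.begin(), times.end(), older); // times[0] == 10
//   std::pop_heap(times.begin(), times.end(), older); // 10 moves to the back
//   times.pop_back(); // evicts the oldest access time
//
// With this comparator the front of the heap is always the smallest (oldest)
// access time, which is exactly the entry load() evicts first.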

void genHashForFile(const std::string& path,
                    const std::string& dir,
                    QueryContext& context,
                    QueryData& results,
                    Logger& logger) {
  // The path, filename, and directory must be provided separately from the
  // boost path->string helpers to match any explicit (query-parsed)
  // predicate constraints.
  auto tr = TableRowHolder(new DynamicTableRow());
  MultiHashes hashes;
  bool from_cache = false;
  if (!FLAGS_disable_hash_cache) {
    FileHashCache::load(path, hashes, logger);
  } else {
    if (context.isCached(path)) {
      // Use the inner-query cache if the global hash cache is disabled.
      // This protects against hashing the same content twice in the same
      // query.
      tr = context.getCache(path);
      from_cache = true;
    } else {
      hashes = hashMultiFromFile(
          HASH_TYPE_MD5 | HASH_TYPE_SHA1 | HASH_TYPE_SHA256, path);
      std::this_thread::sleep_for(std::chrono::milliseconds(FLAGS_hash_delay));
    }
  }

  DynamicTableRow& r = *dynamic_cast<DynamicTableRow*>(tr.get());
  r["path"] = path;
  r["directory"] = dir;
  if (!from_cache) {
    // Only overwrite the digest columns when hashes were computed here; a
    // row served from the inner-query cache already carries its digests, and
    // `hashes` would be empty.
    r["md5"] = std::move(hashes.md5);
    r["sha1"] = std::move(hashes.sha1);
    r["sha256"] = std::move(hashes.sha256);
  }

  if (FLAGS_disable_hash_cache) {
    context.setCache(path, tr);
  }

  r["pid_with_namespace"] = "0";

  results.push_back(static_cast<Row>(r));
}
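
// Illustrative shape of a row emitted above for a hypothetical file
// /etc/hosts (digest values are fake placeholders, not real output):
//
//   path = /etc/hosts        directory = /etc
//   md5 = 9e10...            sha1 = 2ab4...   sha256 = 5cd1...
//   pid_with_namespace = 0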

void expandFSPathConstraints(QueryContext& context,
                             const std::string& path_column_name,
                             std::set<std::string>& paths) {
  context.expandConstraints(
      path_column_name,
      LIKE,
      paths,
      ([&](const std::string& pattern, std::set<std::string>& out) {
        std::vector<std::string> patterns;
        auto status =
            resolveFilePattern(pattern, patterns, GLOB_ALL | GLOB_NO_CANON);
        if (status.ok()) {
          for (const auto& resolved : patterns) {
            out.insert(resolved);
          }
        }
        return status;
      }));
}
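
// A hedged example of the expansion above: osquery uses '%' as its glob
// wildcard in path predicates, so a query such as
//
//   SELECT * FROM hash WHERE path LIKE '/etc/host%';
//
// passes the pattern to resolveFilePattern(), which globs the filesystem and
// inserts every resolved path (e.g. /etc/hosts, /etc/hostname) into the
// constraint set. The matched paths shown are illustrative; results depend
// on the local filesystem.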

QueryData genHashImpl(QueryContext& context, Logger& logger) {
  QueryData results;
  boost::system::error_code ec;

  // The query must provide a predicate with constraints including path or
  // directory. We search for the parsed predicate constraints with the
  // equals operator.
  auto paths = context.constraints["path"].getAll(EQUALS);
  expandFSPathConstraints(context, "path", paths);

  // Iterate through the file paths, adding the hash results.
  for (const auto& path_string : paths) {
    boost::filesystem::path path = path_string;
    if (!boost::filesystem::is_regular_file(path, ec)) {
      continue;
    }

    genHashForFile(
        path_string, path.parent_path().string(), context, results, logger);
  }

  // Now loop through the constraints on the directory column.
  auto directories = context.constraints["directory"].getAll(EQUALS);
  expandFSPathConstraints(context, "directory", directories);

  // Iterate over the directory paths.
  for (const auto& directory_string : directories) {
    boost::filesystem::path directory = directory_string;
    if (!boost::filesystem::is_directory(directory, ec)) {
      continue;
    }

    // Iterate over the directory's files and generate a hash for each
    // regular file.
    boost::filesystem::directory_iterator begin(directory), end;
    for (; begin != end; ++begin) {
      if (boost::filesystem::is_regular_file(begin->path(), ec)) {
        genHashForFile(
            begin->path().string(), directory_string, context, results, logger);
      }
    }
  }

  return results;
}
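
// Hedged usage examples for the table implemented above (the paths are
// arbitrary; output depends on the local filesystem):
//
//   -- Hash a single file; a path or directory constraint is required.
//   SELECT md5, sha1, sha256 FROM hash WHERE path = '/bin/ls';
//
//   -- Hash every regular file directly inside a directory.
//   SELECT path, sha256 FROM hash WHERE directory = '/etc';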

QueryData genHash(QueryContext& context) {
  if (hasNamespaceConstraint(context)) {
    return generateInNamespace(context, "hash", genHashImpl);
  } else {
    GLOGLogger logger;
    return genHashImpl(context, logger);
  }
}
} // namespace tables
} // namespace osquery