rpfm_lib/files/text/mod.rs
1//---------------------------------------------------------------------------//
2// Copyright (c) 2017-2026 Ismael Gutiérrez González. All rights reserved.
3//
4// This file is part of the Rusted PackFile Manager (RPFM) project,
5// which can be found here: https://github.com/Frodo45127/rpfm.
6//
7// This file is licensed under the MIT license, which can be found here:
8// https://github.com/Frodo45127/rpfm/blob/master/LICENSE.
9//---------------------------------------------------------------------------//
10
11//! Plain text file handling with encoding detection and format recognition.
12//!
13//! This module provides the [`Text`] type for working with plain text files in Total War
14//! PackFiles. It supports multiple encodings and automatically detects file formats based
15//! on extensions to enable syntax highlighting and validation in editors.
16//!
17//! # Supported Encodings
18//!
//! - **ISO-8859-15**: Western European character set (legacy support), reported as [`Encoding::Iso8859_1`]
20//! - **UTF-8**: Modern Unicode encoding (default)
21//! - **UTF-8 with BOM**: UTF-8 with Byte Order Mark
22//! - **UTF-16 LE**: UTF-16 Little Endian with BOM
23//!
24//! Encoding is automatically detected by examining Byte Order Marks (BOMs) and attempting
25//! to decode the data. When encoding, the original encoding is preserved.
26//!
27//! # Format Detection
28//!
29//! The module automatically detects file formats based on file extensions, enabling appropriate
30//! syntax highlighting and validation. Supported formats include Lua scripts, XML configuration,
31//! JSON data, shader code, and more.
32//!
33//! # Supported File Extensions
34//!
35//! The following table lists all file extensions recognized as text files:
36//!
//! | Extension                     | Format     | Description                                 |
//! | ----------------------------- | ---------- | ------------------------------------------- |
40//! | `.agf` | `Plain` | |
41//! | `.bat` | `Bat` | Windows batch script. |
42//! | `.battle_script` | `Lua` | Battle script in Lua. |
43//! | `.battle_speech_camera` | `Plain` | Camera settings for battle speeches. |
44//! | `.benchmark` | `Xml` | Benchmark settings. |
45//! | `.bob` | `Plain` | BoB settings file. |
46//! | `.cco` | `Plain` | |
47//! | `.cindyscene` | `Xml` | Cutscene editor data. |
48//! | `.cindyscenemanager` | `Xml` | Cutscene manager data. |
49//! | `.code-snippets` | `Json` | VSCode snippet file. |
50//! | `.code-workspace` | `Json` | VSCode workspace file. |
51//! | `.css` | `Css` | CSS stylesheet. |
52//! | `.csv` | `Plain` | Comma-separated values file. |
53//! | `.environment` | `Xml` | Environment settings. |
54//! | `.environment_group` | `Xml` | Environment group settings. |
55//! | `.environment_group.override` | `Xml` | Environment group overrides. |
56//! | `.fbx` | `Plain` | Autodesk FBX (text format). |
57//! | `.fx` | `Cpp` | DirectX effect file. |
58//! | `.fx_fragment` | `Cpp` | DirectX effect fragment. |
59//! | `.glsl` | `Cpp` | OpenGL shader source. |
60//! | `.h` | `Cpp` | C/C++ header file. |
61//! | `.hlsl` | `Hlsl` | High Level Shading Language. |
62//! | `.htm` | `Html` | HTML document. |
63//! | `.html` | `Html` | HTML document. |
64//! | `.inl` | `Cpp` | C++ inline file. |
65//! | `.json` | `Json` | JSON data file. |
66//! | `.js` | `Js` | JavaScript file. |
67//! | `.kfa` | `Xml` | Battle Audio Event file. |
68//! | `.kfc` | `Xml` | Battle Camera file. |
69//! | `.kfe` | `Xml` | Battle Effect file. |
70//! | `.kfe_temp` | `Xml` | Battle Effect (temporary). |
71//! | `.kfl` | `Xml` | Battle Point Light file. |
72//! | `.kfl_temp` | `Xml` | Battle Point Light (temporary). |
73//! | `.kfsl` | `Xml` | Battle Spot Light file. |
74//! | `.kfp` | `Xml` | Battle Prop file. |
75//! | `.kfcs` | `Xml` | Battle Composite Scene file. |
76//! | `.kfcs_temp` | `Xml` | Battle Composite Scene (temporary). |
77//! | `.ktr` | `Xml` | Battle Tracker file. |
78//! | `.ktr_temp` | `Xml` | Battle Tracker (temporary). |
79//! | `.lighting` | `Xml` | Lighting configuration. |
80//! | `.log` | `Plain` | Log file. |
81//! | `.lua` | `Lua` | Lua script file. |
82//! | `.material` | `Xml` | Material definition. |
83//! | `.md` | `Markdown` | Markdown documentation. |
84//! | `.model_statistics` | `Xml` | Model statistics data. |
85//! | `.mvscene` | `Xml` | Movie scene file. |
86//! | `.py` | `Python` | Python script. |
87//! | `.sbs` | `Xml` | Substance Designer file. |
88//! | `.shader` | `Xml` | Shader definition. |
89//! | `.sql` | `Sql` | SQL query file. |
90//! | `.tai` | `Plain` | |
91//! | `.technique` | `Xml` | Rendering technique definition. |
92//! | `.texture_array` | `Plain` | List of campaign map textures. |
93//! | `.tsv` | `Plain` | Tab-separated values file. |
94//! | `.twui` | `Lua` | Total War UI file (Lua format). |
95//! | `.txt` | `Plain` | Plain text file. |
96//! | `.xml` | `Xml` | XML file. |
97//! | `.xml_temp` | `Xml` | XML (temporary). |
98//! | `.xml.shader` | `Xml` | Shader metadata (XML). |
99//! | `.xml.material` | `Xml` | Material metadata (XML). |
100//! | `.xt` | `Plain` | Text file (typo variant). |
101//! | `.yml` | `Yaml` | YAML configuration file. |
102//! | `.yaml` | `Yaml` | YAML configuration file. |
103//!
//! Note: `.variantmeshdefinition` and `.wsmodel` are also supported, but they are declared
//! separately as [`EXTENSION_VMD`] and [`EXTENSION_WSMODEL`] instead of being part of [`EXTENSIONS`].
105
106use getset::*;
107use serde_derive::{Serialize, Deserialize};
108
109use std::io::SeekFrom;
110
111use crate::binary::{ReadBytes, WriteBytes};
112use crate::error::{Result, RLibError};
113use crate::files::{Decodeable, EncodeableExtraData, Encodeable};
114
115use super::DecodeableExtraData;
116
/// Byte Order Mark found at the start of UTF-8 files saved "with BOM" (`EF BB BF`).
const BOM_UTF_8: [u8; 3] = *b"\xEF\xBB\xBF";

/// Byte Order Mark found at the start of little-endian UTF-16 files (`FF FE`).
const BOM_UTF_16_LE: [u8; 2] = *b"\xFF\xFE";
122
123/// List of extensions we recognize as `Text` files, with their respective known format.
124pub const EXTENSIONS: [(&str, TextFormat); 63] = [
125 (".agf", TextFormat::Plain),
126 (".bat", TextFormat::Bat),
127 (".battle_script", TextFormat::Lua),
128 (".battle_speech_camera", TextFormat::Plain),
129 (".benchmark", TextFormat::Xml),
130 (".bob", TextFormat::Plain),
131 (".cco", TextFormat::Plain),
132 (".cindyscene", TextFormat::Xml),
133 (".cindyscenemanager", TextFormat::Xml),
134 (".code-snippets", TextFormat::Json),
135 (".code-workspace", TextFormat::Json),
136 (".css", TextFormat::Css),
137 (".csv", TextFormat::Plain),
138 (".environment", TextFormat::Xml),
139 (".environment_group", TextFormat::Xml),
140 (".environment_group.override", TextFormat::Xml),
141 (".fbx", TextFormat::Plain),
142 (".fx", TextFormat::Cpp),
143 (".fx_fragment", TextFormat::Cpp),
144 (".glsl", TextFormat::Cpp),
145 (".h", TextFormat::Cpp),
146 (".hlsl", TextFormat::Hlsl),
147 (".htm", TextFormat::Html),
148 (".html", TextFormat::Html),
149 (".inl", TextFormat::Cpp),
150 (".json", TextFormat::Json),
151 (".js", TextFormat::Js),
152 (".kfa", TextFormat::Xml),
153 (".kfc", TextFormat::Xml),
154 (".kfe", TextFormat::Xml),
155 (".kfe_temp", TextFormat::Xml),
156 (".kfl", TextFormat::Xml),
157 (".kfl_temp", TextFormat::Xml),
158 (".kfsl", TextFormat::Xml),
159 (".kfp", TextFormat::Xml),
160 (".kfcs", TextFormat::Xml),
161 (".kfcs_temp", TextFormat::Xml),
162 (".ktr", TextFormat::Xml),
163 (".ktr_temp", TextFormat::Xml),
164 (".lighting", TextFormat::Xml),
165 (".log", TextFormat::Plain),
166 (".lua", TextFormat::Lua),
167 (".md", TextFormat::Markdown),
168 (".model_statistics", TextFormat::Xml),
169 (".mvscene", TextFormat::Xml),
170 (".py", TextFormat::Python),
171 (".sbs", TextFormat::Xml),
172 (".shader", TextFormat::Xml),
173 (".sql", TextFormat::Sql),
174 (".tai", TextFormat::Plain),
175 (".technique", TextFormat::Xml),
176 (".texture_array", TextFormat::Plain),
177 (".tsv", TextFormat::Plain),
178 (".twui", TextFormat::Lua),
179 (".txt", TextFormat::Plain),
180 (".xml", TextFormat::Xml),
181 (".xml_temp", TextFormat::Xml),
182 (".xml.shader", TextFormat::Xml),
183 (".xml.material", TextFormat::Xml),
184 (".xt", TextFormat::Plain),
185 (".yml", TextFormat::Yaml),
186 (".yaml", TextFormat::Yaml),
187 (".material", TextFormat::Xml), // This has to be under xml.material
188];
189
190/// Extension for VMD, or Variant Mesh Definitions.
191pub const EXTENSION_VMD: (&str, TextFormat) = (".variantmeshdefinition", TextFormat::Xml);
192
193/// Extension for WS Models.
194pub const EXTENSION_WSMODEL: (&str, TextFormat) = (".wsmodel", TextFormat::Xml);
195
196#[cfg(test)] mod text_test;
197
198//---------------------------------------------------------------------------//
199// Enum & Structs
200//---------------------------------------------------------------------------//
201
202/// In-memory representation of a decoded text file.
203///
204/// Stores the text contents along with encoding and format metadata. The encoding
205/// is preserved when re-encoding to maintain file compatibility.
206///
207/// # Fields
208///
209/// * `encoding` - Character encoding detected or specified for the file
210/// * `format` - File format detected from extension (for syntax highlighting)
211/// * `contents` - Decoded text contents as a UTF-8 Rust string
212///
213/// # Getters/Setters
214///
215/// All fields have public getters, mutable getters, and setters via the `getset` crate:
216/// - `encoding()`, `encoding_mut()`, `set_encoding()`
217/// - `format()`, `format_mut()`, `set_format()`
218/// - `contents()`, `contents_mut()`, `set_contents()`
219///
220/// # Example
221///
222/// ```ignore
223/// use rpfm_lib::files::{Decodeable, text::Text, DecodeableExtraData};
224/// use std::io::Cursor;
225///
226/// let data = b"Hello, World!";
227/// let mut reader = Cursor::new(data);
228/// let text = Text::decode(&mut reader, &None).unwrap();
229///
230/// assert_eq!(text.contents(), "Hello, World!");
231/// ```
232#[derive(Default, PartialEq, Eq, Clone, Debug, Getters, MutGetters, Setters, Serialize, Deserialize)]
233#[getset(get = "pub", get_mut = "pub", set = "pub")]
234pub struct Text {
235
236 /// Character encoding of the file.
237 encoding: Encoding,
238
239 /// Detected file format based on extension.
240 format: TextFormat,
241
242 /// Decoded text contents.
243 contents: String
244}
245
246/// Character encoding types supported for text files.
247///
248/// Different Total War games and file types use different encodings. This enum
249/// represents all encodings that rpfm_lib can read and write.
250///
251/// # Encoding Detection
252///
253/// Encodings are detected in the following order:
254/// 1. Check for UTF-8 BOM (`0xEF 0xBB 0xBF`)
255/// 2. Check for UTF-16 LE BOM (`0xFF 0xFE`)
256/// 3. Attempt UTF-8 decode without BOM
257/// 4. Attempt ISO-8859-1 decode
258///
259/// # Re-encoding
260///
261/// When a text file is saved, the original encoding is preserved to maintain
262/// compatibility with the game engine.
263#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
264pub enum Encoding {
265 /// ISO-8859-1 encoding (Western European, legacy support).
266 Iso8859_1,
267
268 /// UTF-8 encoding without BOM (default for new files).
269 Utf8,
270
271 /// UTF-8 encoding with BOM marker.
272 Utf8Bom,
273
274 /// UTF-16 Little Endian encoding with BOM marker.
275 Utf16Le,
276}
277
278/// File format types for syntax highlighting and validation.
279///
280/// Based on file extension, text files are classified into different formats.
281/// This allows text editors to apply appropriate syntax highlighting, code
282/// completion, and validation rules.
283///
284/// # Format Detection
285///
286/// Format is determined by matching the file extension against the [`EXTENSIONS`]
287/// table. If no match is found, defaults to [`TextFormat::Plain`].
288#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
289pub enum TextFormat {
290 /// Windows batch script (`.bat`).
291 Bat,
292
293 /// C++ code or GLSL shaders (`.cpp`, `.h`, `.glsl`, `.inl`, `.fx`).
294 Cpp,
295
296 /// HTML documents (`.html`, `.htm`).
297 Html,
298
299 /// HLSL shader code (`.hlsl`).
300 Hlsl,
301
302 /// JSON data files (`.json`, `.code-snippets`, `.code-workspace`).
303 Json,
304
305 /// JavaScript code (`.js`).
306 Js,
307
308 /// CSS stylesheets (`.css`).
309 Css,
310
311 /// Lua scripts (`.lua`, `.twui`, `.battle_script`).
312 Lua,
313
314 /// Markdown documentation (`.md`).
315 Markdown,
316
317 /// Plain text with no specific format (`.txt`, `.csv`, `.tsv`, `.log`, etc.).
318 Plain,
319
320 /// Python scripts (`.py`).
321 Python,
322
323 /// SQL queries (`.sql`).
324 Sql,
325
326 /// XML configuration and data files (`.xml`, `.kf*`, `.cindyscene`, etc.).
327 Xml,
328
329 /// YAML configuration files (`.yaml`, `.yml`).
330 Yaml,
331}
332
333//---------------------------------------------------------------------------//
334// Implementation of Text
335//---------------------------------------------------------------------------//
336
337/// Implementation of `Default` for `Encoding`.
338impl Default for Encoding {
339
340 /// This returns `Encoding::Utf8`, as it's our default encoding.
341 fn default() -> Self {
342 Encoding::Utf8
343 }
344}
345
346/// Implementation of `Default` for `TextFormat`.
347impl Default for TextFormat {
348
349 /// This returns `TextFormat::Plain`, as it's our default format.
350 fn default() -> Self {
351 TextFormat::Plain
352 }
353}
354
355impl Text {
356
357 /// Detects the character encoding of text data.
358 ///
359 /// Examines the data stream to determine its encoding by checking for Byte Order Marks
360 /// (BOMs) and attempting to decode as different encodings.
361 ///
362 /// # Detection Algorithm
363 ///
364 /// 1. **UTF-8 BOM**: Checks for `0xEF 0xBB 0xBF` at the start
365 /// 2. **UTF-16 LE BOM**: Checks for `0xFF 0xFE` at the start
366 /// 3. **UTF-8 without BOM**: Attempts to decode entire file as UTF-8
367 /// 4. **ISO-8859-1**: Attempts to decode as ISO-8859-1
368 ///
369 /// # Arguments
370 ///
371 /// * `data` - Reader positioned at the start of the text data
372 ///
373 /// # Returns
374 ///
375 /// The detected [`Encoding`], or an error if no supported encoding matches.
376 ///
377 /// # Errors
378 ///
379 /// Returns [`RLibError::DecodingTextUnsupportedEncodingOrNotATextFile`] if:
380 /// - The data cannot be decoded as any supported encoding
381 /// - The file is not actually a text file
382 ///
383 /// # Side Effects
384 ///
385 /// After detection, the reader is repositioned:
386 /// - After the BOM if one was found
387 /// - At the start if no BOM was found
388 pub fn detect_encoding<R: ReadBytes>(data: &mut R) -> Result<Encoding> {
389 let len = data.len()?;
390
391 // First, check for BOMs. 2 bytes for UTF-16 BOMs, 3 for UTF-8.
392 if len > 2 && data.read_slice(3, true)? == BOM_UTF_8 {
393 data.seek(SeekFrom::Start(3))?;
394 return Ok(Encoding::Utf8Bom)
395 }
396 else if len > 1 && data.read_slice(2, true)? == BOM_UTF_16_LE {
397 data.seek(SeekFrom::Start(2))?;
398 return Ok(Encoding::Utf16Le)
399 }
400
401 // If no BOM is found, we assume UTF-8 if it decodes properly.
402 else {
403 let utf8_string = data.read_string_u8(len as usize);
404 if utf8_string.is_ok() {
405 data.rewind()?;
406 return Ok(Encoding::Utf8)
407 }
408
409 data.rewind()?;
410 let iso_8859_1_string = data.read_string_u8_iso_8859_15(len as usize);
411 if iso_8859_1_string.is_ok() {
412 data.rewind()?;
413 return Ok(Encoding::Iso8859_1)
414 }
415 }
416
417 // If we reach this, we do not support the format.
418 data.rewind()?;
419 Err(RLibError::DecodingTextUnsupportedEncodingOrNotATextFile)
420 }
421}
422
423impl Decodeable for Text {
424
425 fn decode<R: ReadBytes>(data: &mut R, extra_data: &Option<DecodeableExtraData>) -> Result<Self> {
426 let len = data.len()?;
427 let encoding = Self::detect_encoding(data)?;
428 let contents = match encoding {
429 Encoding::Iso8859_1 => data.read_string_u8_iso_8859_15(len as usize)
430 .map_err(|_| RLibError::DecodingTextUnsupportedEncodingOrNotATextFile)?,
431
432 Encoding::Utf8 |
433 Encoding::Utf8Bom => {
434 let curr_pos = data.stream_position()?;
435 data.read_string_u8((len - curr_pos) as usize)
436 .map_err(|_| RLibError::DecodingTextUnsupportedEncodingOrNotATextFile)?
437 },
438 Encoding::Utf16Le => {
439 let curr_pos = data.stream_position()?;
440 data.read_string_u16((len - curr_pos) as usize)
441 .map_err(|_| RLibError::DecodingTextUnsupportedEncodingOrNotATextFile)?
442 }
443 };
444
445 // Try to get the format of the file.
446 let format = match extra_data {
447 Some(extra_data) => match extra_data.file_name {
448 Some(file_name) => {
449 match EXTENSIONS.iter().find_map(|(extension, format)| if file_name.ends_with(extension) { Some(format) } else { None }) {
450 Some(format) => *format,
451 None => TextFormat::Plain,
452 }
453 }
454 None => TextFormat::Plain,
455 }
456
457 None => TextFormat::Plain,
458 };
459
460 Ok(Self {
461 encoding,
462 format,
463 contents,
464 })
465 }
466}
467
468impl Encodeable for Text {
469
470 fn encode<W: WriteBytes>(&mut self, buffer: &mut W, _extra_data: &Option<EncodeableExtraData>) -> Result<()> {
471 match self.encoding {
472 Encoding::Iso8859_1 => buffer.write_string_u8_iso_8859_1(&self.contents),
473 Encoding::Utf8 => buffer.write_string_u8(&self.contents),
474 Encoding::Utf8Bom => {
475 buffer.write_all(&BOM_UTF_8)?;
476 buffer.write_string_u8(&self.contents)
477 },
478
479 // For UTF-16 we always have to add the BOM. Otherwise we have no way to easily tell what this file is.
480 Encoding::Utf16Le => {
481 buffer.write_all(&BOM_UTF_16_LE)?;
482 buffer.write_string_u16(&self.contents)
483 },
484 }
485 }
486}