Skip to main content

rpfm_lib/files/text/
mod.rs

1//---------------------------------------------------------------------------//
2// Copyright (c) 2017-2026 Ismael Gutiérrez González. All rights reserved.
3//
4// This file is part of the Rusted PackFile Manager (RPFM) project,
5// which can be found here: https://github.com/Frodo45127/rpfm.
6//
7// This file is licensed under the MIT license, which can be found here:
8// https://github.com/Frodo45127/rpfm/blob/master/LICENSE.
9//---------------------------------------------------------------------------//
10
11//! Plain text file handling with encoding detection and format recognition.
12//!
13//! This module provides the [`Text`] type for working with plain text files in Total War
14//! PackFiles. It supports multiple encodings and automatically detects file formats based
15//! on extensions to enable syntax highlighting and validation in editors.
16//!
17//! # Supported Encodings
18//!
19//! - **ISO-8859-15**: Western European character set (legacy support)
20//! - **UTF-8**: Modern Unicode encoding (default)
21//! - **UTF-8 with BOM**: UTF-8 with Byte Order Mark
22//! - **UTF-16 LE**: UTF-16 Little Endian with BOM
23//!
24//! Encoding is automatically detected by examining Byte Order Marks (BOMs) and attempting
25//! to decode the data. When encoding, the original encoding is preserved.
26//!
27//! # Format Detection
28//!
29//! The module automatically detects file formats based on file extensions, enabling appropriate
30//! syntax highlighting and validation. Supported formats include Lua scripts, XML configuration,
31//! JSON data, shader code, and more.
32//!
33//! # Supported File Extensions
34//!
35//! The following table lists all file extensions recognized as text files:
36//!
//! | Extension                     | Format     | Description                                 |
39//! | ----------------------------- | ---------- | ------------------------------------------- |
40//! | `.agf`                        | `Plain`    |                                             |
41//! | `.bat`                        | `Bat`      | Windows batch script.                       |
42//! | `.battle_script`              | `Lua`      | Battle script in Lua.                       |
43//! | `.battle_speech_camera`       | `Plain`    | Camera settings for battle speeches.        |
44//! | `.benchmark`                  | `Xml`      | Benchmark settings.                         |
45//! | `.bob`                        | `Plain`    | BoB settings file.                          |
46//! | `.cco`                        | `Plain`    |                                             |
47//! | `.cindyscene`                 | `Xml`      | Cutscene editor data.                       |
48//! | `.cindyscenemanager`          | `Xml`      | Cutscene manager data.                      |
49//! | `.code-snippets`              | `Json`     | VSCode snippet file.                        |
50//! | `.code-workspace`             | `Json`     | VSCode workspace file.                      |
51//! | `.css`                        | `Css`      | CSS stylesheet.                             |
52//! | `.csv`                        | `Plain`    | Comma-separated values file.                |
53//! | `.environment`                | `Xml`      | Environment settings.                       |
54//! | `.environment_group`          | `Xml`      | Environment group settings.                 |
55//! | `.environment_group.override` | `Xml`      | Environment group overrides.                |
56//! | `.fbx`                        | `Plain`    | Autodesk FBX (text format).                 |
57//! | `.fx`                         | `Cpp`      | DirectX effect file.                        |
58//! | `.fx_fragment`                | `Cpp`      | DirectX effect fragment.                    |
59//! | `.glsl`                       | `Cpp`      | OpenGL shader source.                       |
60//! | `.h`                          | `Cpp`      | C/C++ header file.                          |
61//! | `.hlsl`                       | `Hlsl`     | High Level Shading Language.                |
62//! | `.htm`                        | `Html`     | HTML document.                              |
63//! | `.html`                       | `Html`     | HTML document.                              |
64//! | `.inl`                        | `Cpp`      | C++ inline file.                            |
65//! | `.json`                       | `Json`     | JSON data file.                             |
66//! | `.js`                         | `Js`       | JavaScript file.                            |
67//! | `.kfa`                        | `Xml`      | Battle Audio Event file.                    |
68//! | `.kfc`                        | `Xml`      | Battle Camera file.                         |
69//! | `.kfe`                        | `Xml`      | Battle Effect file.                         |
70//! | `.kfe_temp`                   | `Xml`      | Battle Effect (temporary).                  |
71//! | `.kfl`                        | `Xml`      | Battle Point Light file.                    |
72//! | `.kfl_temp`                   | `Xml`      | Battle Point Light (temporary).             |
73//! | `.kfsl`                       | `Xml`      | Battle Spot Light file.                     |
74//! | `.kfp`                        | `Xml`      | Battle Prop file.                           |
75//! | `.kfcs`                       | `Xml`      | Battle Composite Scene file.                |
76//! | `.kfcs_temp`                  | `Xml`      | Battle Composite Scene (temporary).         |
77//! | `.ktr`                        | `Xml`      | Battle Tracker file.                        |
78//! | `.ktr_temp`                   | `Xml`      | Battle Tracker (temporary).                 |
79//! | `.lighting`                   | `Xml`      | Lighting configuration.                     |
80//! | `.log`                        | `Plain`    | Log file.                                   |
81//! | `.lua`                        | `Lua`      | Lua script file.                            |
82//! | `.material`                   | `Xml`      | Material definition.                        |
83//! | `.md`                         | `Markdown` | Markdown documentation.                     |
84//! | `.model_statistics`           | `Xml`      | Model statistics data.                      |
85//! | `.mvscene`                    | `Xml`      | Movie scene file.                           |
86//! | `.py`                         | `Python`   | Python script.                              |
87//! | `.sbs`                        | `Xml`      | Substance Designer file.                    |
88//! | `.shader`                     | `Xml`      | Shader definition.                          |
89//! | `.sql`                        | `Sql`      | SQL query file.                             |
90//! | `.tai`                        | `Plain`    |                                             |
91//! | `.technique`                  | `Xml`      | Rendering technique definition.             |
92//! | `.texture_array`              | `Plain`    | List of campaign map textures.              |
93//! | `.tsv`                        | `Plain`    | Tab-separated values file.                  |
94//! | `.twui`                       | `Lua`      | Total War UI file (Lua format).             |
95//! | `.txt`                        | `Plain`    | Plain text file.                            |
96//! | `.xml`                        | `Xml`      | XML file.                                   |
97//! | `.xml_temp`                   | `Xml`      | XML (temporary).                            |
98//! | `.xml.shader`                 | `Xml`      | Shader metadata (XML).                      |
99//! | `.xml.material`               | `Xml`      | Material metadata (XML).                    |
100//! | `.xt`                         | `Plain`    | Text file (typo variant).                   |
101//! | `.yml`                        | `Yaml`     | YAML configuration file.                    |
102//! | `.yaml`                       | `Yaml`     | YAML configuration file.                    |
103//!
104//! Note: `.variantmeshdefinition` and `.wsmodel` are also supported but listed separately in the code.
105
106use getset::*;
107use serde_derive::{Serialize, Deserialize};
108
109use std::io::SeekFrom;
110
111use crate::binary::{ReadBytes, WriteBytes};
112use crate::error::{Result, RLibError};
113use crate::files::{Decodeable, EncodeableExtraData, Encodeable};
114
115use super::DecodeableExtraData;
116
/// Byte Order Mark found at the start of UTF-8 files saved "with BOM".
const BOM_UTF_8: [u8; 3] = [0xEF, 0xBB, 0xBF];

/// Byte Order Mark found at the start of little-endian UTF-16 files.
const BOM_UTF_16_LE: [u8; 2] = [0xFF, 0xFE];
122
123/// List of extensions we recognize as `Text` files, with their respective known format.
124pub const EXTENSIONS: [(&str, TextFormat); 63] = [
125    (".agf", TextFormat::Plain),
126    (".bat", TextFormat::Bat),
127    (".battle_script", TextFormat::Lua),
128    (".battle_speech_camera", TextFormat::Plain),
129    (".benchmark", TextFormat::Xml),
130    (".bob", TextFormat::Plain),
131    (".cco", TextFormat::Plain),
132    (".cindyscene", TextFormat::Xml),
133    (".cindyscenemanager", TextFormat::Xml),
134    (".code-snippets", TextFormat::Json),
135    (".code-workspace", TextFormat::Json),
136    (".css", TextFormat::Css),
137    (".csv", TextFormat::Plain),
138    (".environment", TextFormat::Xml),
139    (".environment_group", TextFormat::Xml),
140    (".environment_group.override", TextFormat::Xml),
141    (".fbx", TextFormat::Plain),
142    (".fx", TextFormat::Cpp),
143    (".fx_fragment", TextFormat::Cpp),
144    (".glsl", TextFormat::Cpp),
145    (".h", TextFormat::Cpp),
146    (".hlsl", TextFormat::Hlsl),
147    (".htm", TextFormat::Html),
148    (".html", TextFormat::Html),
149    (".inl", TextFormat::Cpp),
150    (".json", TextFormat::Json),
151    (".js", TextFormat::Js),
152    (".kfa", TextFormat::Xml),
153    (".kfc", TextFormat::Xml),
154    (".kfe", TextFormat::Xml),
155    (".kfe_temp", TextFormat::Xml),
156    (".kfl", TextFormat::Xml),
157    (".kfl_temp", TextFormat::Xml),
158    (".kfsl", TextFormat::Xml),
159    (".kfp", TextFormat::Xml),
160    (".kfcs", TextFormat::Xml),
161    (".kfcs_temp", TextFormat::Xml),
162    (".ktr", TextFormat::Xml),
163    (".ktr_temp", TextFormat::Xml),
164    (".lighting", TextFormat::Xml),
165    (".log", TextFormat::Plain),
166    (".lua", TextFormat::Lua),
167    (".md", TextFormat::Markdown),
168    (".model_statistics", TextFormat::Xml),
169    (".mvscene", TextFormat::Xml),
170    (".py", TextFormat::Python),
171    (".sbs", TextFormat::Xml),
172    (".shader", TextFormat::Xml),
173    (".sql", TextFormat::Sql),
174    (".tai", TextFormat::Plain),
175    (".technique", TextFormat::Xml),
176    (".texture_array", TextFormat::Plain),
177    (".tsv", TextFormat::Plain),
178    (".twui", TextFormat::Lua),
179    (".txt", TextFormat::Plain),
180    (".xml", TextFormat::Xml),
181    (".xml_temp", TextFormat::Xml),
182    (".xml.shader", TextFormat::Xml),
183    (".xml.material", TextFormat::Xml),
184    (".xt", TextFormat::Plain),
185    (".yml", TextFormat::Yaml),
186    (".yaml", TextFormat::Yaml),
187    (".material", TextFormat::Xml),     // This has to be under xml.material
188];
189
190/// Extension for VMD, or Variant Mesh Definitions.
191pub const EXTENSION_VMD: (&str, TextFormat) = (".variantmeshdefinition", TextFormat::Xml);
192
193/// Extension for WS Models.
194pub const EXTENSION_WSMODEL: (&str, TextFormat) = (".wsmodel", TextFormat::Xml);
195
196#[cfg(test)] mod text_test;
197
198//---------------------------------------------------------------------------//
199//                              Enum & Structs
200//---------------------------------------------------------------------------//
201
202/// In-memory representation of a decoded text file.
203///
204/// Stores the text contents along with encoding and format metadata. The encoding
205/// is preserved when re-encoding to maintain file compatibility.
206///
207/// # Fields
208///
209/// * `encoding` - Character encoding detected or specified for the file
210/// * `format` - File format detected from extension (for syntax highlighting)
211/// * `contents` - Decoded text contents as a UTF-8 Rust string
212///
213/// # Getters/Setters
214///
215/// All fields have public getters, mutable getters, and setters via the `getset` crate:
216/// - `encoding()`, `encoding_mut()`, `set_encoding()`
217/// - `format()`, `format_mut()`, `set_format()`
218/// - `contents()`, `contents_mut()`, `set_contents()`
219///
220/// # Example
221///
222/// ```ignore
223/// use rpfm_lib::files::{Decodeable, text::Text, DecodeableExtraData};
224/// use std::io::Cursor;
225///
226/// let data = b"Hello, World!";
227/// let mut reader = Cursor::new(data);
228/// let text = Text::decode(&mut reader, &None).unwrap();
229///
230/// assert_eq!(text.contents(), "Hello, World!");
231/// ```
232#[derive(Default, PartialEq, Eq, Clone, Debug, Getters, MutGetters, Setters, Serialize, Deserialize)]
233#[getset(get = "pub", get_mut = "pub", set = "pub")]
234pub struct Text {
235
236    /// Character encoding of the file.
237    encoding: Encoding,
238
239    /// Detected file format based on extension.
240    format: TextFormat,
241
242    /// Decoded text contents.
243    contents: String
244}
245
246/// Character encoding types supported for text files.
247///
248/// Different Total War games and file types use different encodings. This enum
249/// represents all encodings that rpfm_lib can read and write.
250///
251/// # Encoding Detection
252///
253/// Encodings are detected in the following order:
254/// 1. Check for UTF-8 BOM (`0xEF 0xBB 0xBF`)
255/// 2. Check for UTF-16 LE BOM (`0xFF 0xFE`)
256/// 3. Attempt UTF-8 decode without BOM
257/// 4. Attempt ISO-8859-1 decode
258///
259/// # Re-encoding
260///
261/// When a text file is saved, the original encoding is preserved to maintain
262/// compatibility with the game engine.
263#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
264pub enum Encoding {
265    /// ISO-8859-1 encoding (Western European, legacy support).
266    Iso8859_1,
267
268    /// UTF-8 encoding without BOM (default for new files).
269    Utf8,
270
271    /// UTF-8 encoding with BOM marker.
272    Utf8Bom,
273
274    /// UTF-16 Little Endian encoding with BOM marker.
275    Utf16Le,
276}
277
278/// File format types for syntax highlighting and validation.
279///
280/// Based on file extension, text files are classified into different formats.
281/// This allows text editors to apply appropriate syntax highlighting, code
282/// completion, and validation rules.
283///
284/// # Format Detection
285///
286/// Format is determined by matching the file extension against the [`EXTENSIONS`]
287/// table. If no match is found, defaults to [`TextFormat::Plain`].
288#[derive(PartialEq, Eq, Clone, Copy, Debug, Serialize, Deserialize)]
289pub enum TextFormat {
290    /// Windows batch script (`.bat`).
291    Bat,
292
293    /// C++ code or GLSL shaders (`.cpp`, `.h`, `.glsl`, `.inl`, `.fx`).
294    Cpp,
295
296    /// HTML documents (`.html`, `.htm`).
297    Html,
298
299    /// HLSL shader code (`.hlsl`).
300    Hlsl,
301
302    /// JSON data files (`.json`, `.code-snippets`, `.code-workspace`).
303    Json,
304
305    /// JavaScript code (`.js`).
306    Js,
307
308    /// CSS stylesheets (`.css`).
309    Css,
310
311    /// Lua scripts (`.lua`, `.twui`, `.battle_script`).
312    Lua,
313
314    /// Markdown documentation (`.md`).
315    Markdown,
316
317    /// Plain text with no specific format (`.txt`, `.csv`, `.tsv`, `.log`, etc.).
318    Plain,
319
320    /// Python scripts (`.py`).
321    Python,
322
323    /// SQL queries (`.sql`).
324    Sql,
325
326    /// XML configuration and data files (`.xml`, `.kf*`, `.cindyscene`, etc.).
327    Xml,
328
329    /// YAML configuration files (`.yaml`, `.yml`).
330    Yaml,
331}
332
333//---------------------------------------------------------------------------//
334//                           Implementation of Text
335//---------------------------------------------------------------------------//
336
337/// Implementation of `Default` for `Encoding`.
338impl Default for Encoding {
339
340    /// This returns `Encoding::Utf8`, as it's our default encoding.
341    fn default() -> Self {
342        Encoding::Utf8
343    }
344}
345
346/// Implementation of `Default` for `TextFormat`.
347impl Default for TextFormat {
348
349    /// This returns `TextFormat::Plain`, as it's our default format.
350    fn default() -> Self {
351        TextFormat::Plain
352    }
353}
354
355impl Text {
356
357    /// Detects the character encoding of text data.
358    ///
359    /// Examines the data stream to determine its encoding by checking for Byte Order Marks
360    /// (BOMs) and attempting to decode as different encodings.
361    ///
362    /// # Detection Algorithm
363    ///
364    /// 1. **UTF-8 BOM**: Checks for `0xEF 0xBB 0xBF` at the start
365    /// 2. **UTF-16 LE BOM**: Checks for `0xFF 0xFE` at the start
366    /// 3. **UTF-8 without BOM**: Attempts to decode entire file as UTF-8
367    /// 4. **ISO-8859-1**: Attempts to decode as ISO-8859-1
368    ///
369    /// # Arguments
370    ///
371    /// * `data` - Reader positioned at the start of the text data
372    ///
373    /// # Returns
374    ///
375    /// The detected [`Encoding`], or an error if no supported encoding matches.
376    ///
377    /// # Errors
378    ///
379    /// Returns [`RLibError::DecodingTextUnsupportedEncodingOrNotATextFile`] if:
380    /// - The data cannot be decoded as any supported encoding
381    /// - The file is not actually a text file
382    ///
383    /// # Side Effects
384    ///
385    /// After detection, the reader is repositioned:
386    /// - After the BOM if one was found
387    /// - At the start if no BOM was found
388    pub fn detect_encoding<R: ReadBytes>(data: &mut R) -> Result<Encoding> {
389        let len = data.len()?;
390
391        // First, check for BOMs. 2 bytes for UTF-16 BOMs, 3 for UTF-8.
392        if len > 2 && data.read_slice(3, true)? == BOM_UTF_8 {
393            data.seek(SeekFrom::Start(3))?;
394            return Ok(Encoding::Utf8Bom)
395        }
396        else if len > 1 && data.read_slice(2, true)? == BOM_UTF_16_LE {
397            data.seek(SeekFrom::Start(2))?;
398            return Ok(Encoding::Utf16Le)
399        }
400
401        // If no BOM is found, we assume UTF-8 if it decodes properly.
402        else {
403            let utf8_string = data.read_string_u8(len as usize);
404            if utf8_string.is_ok() {
405                data.rewind()?;
406                return Ok(Encoding::Utf8)
407            }
408
409            data.rewind()?;
410            let iso_8859_1_string = data.read_string_u8_iso_8859_15(len as usize);
411            if iso_8859_1_string.is_ok() {
412                data.rewind()?;
413                return Ok(Encoding::Iso8859_1)
414            }
415        }
416
417        // If we reach this, we do not support the format.
418        data.rewind()?;
419        Err(RLibError::DecodingTextUnsupportedEncodingOrNotATextFile)
420    }
421}
422
423impl Decodeable for Text {
424
425    fn decode<R: ReadBytes>(data: &mut R, extra_data: &Option<DecodeableExtraData>) -> Result<Self> {
426        let len = data.len()?;
427        let encoding = Self::detect_encoding(data)?;
428        let contents = match encoding {
429            Encoding::Iso8859_1 => data.read_string_u8_iso_8859_15(len as usize)
430                .map_err(|_| RLibError::DecodingTextUnsupportedEncodingOrNotATextFile)?,
431
432            Encoding::Utf8 |
433            Encoding::Utf8Bom => {
434                let curr_pos = data.stream_position()?;
435                data.read_string_u8((len - curr_pos) as usize)
436                    .map_err(|_| RLibError::DecodingTextUnsupportedEncodingOrNotATextFile)?
437            },
438            Encoding::Utf16Le => {
439                let curr_pos = data.stream_position()?;
440                data.read_string_u16((len - curr_pos) as usize)
441                    .map_err(|_| RLibError::DecodingTextUnsupportedEncodingOrNotATextFile)?
442            }
443        };
444
445        // Try to get the format of the file.
446        let format = match extra_data {
447            Some(extra_data) => match extra_data.file_name {
448                Some(file_name) => {
449                    match EXTENSIONS.iter().find_map(|(extension, format)| if file_name.ends_with(extension) { Some(format) } else { None }) {
450                        Some(format) => *format,
451                        None => TextFormat::Plain,
452                    }
453                }
454                None => TextFormat::Plain,
455            }
456
457            None => TextFormat::Plain,
458        };
459
460        Ok(Self {
461            encoding,
462            format,
463            contents,
464        })
465    }
466}
467
468impl Encodeable for Text {
469
470    fn encode<W: WriteBytes>(&mut self, buffer: &mut W, _extra_data: &Option<EncodeableExtraData>) -> Result<()> {
471        match self.encoding {
472            Encoding::Iso8859_1 => buffer.write_string_u8_iso_8859_1(&self.contents),
473            Encoding::Utf8 => buffer.write_string_u8(&self.contents),
474            Encoding::Utf8Bom => {
475                buffer.write_all(&BOM_UTF_8)?;
476                buffer.write_string_u8(&self.contents)
477            },
478
479            // For UTF-16 we always have to add the BOM. Otherwise we have no way to easily tell what this file is.
480            Encoding::Utf16Le => {
481                buffer.write_all(&BOM_UTF_16_LE)?;
482                buffer.write_string_u16(&self.contents)
483            },
484        }
485    }
486}