-
Notifications
You must be signed in to change notification settings - Fork 4
/
default.yaml
169 lines (142 loc) · 6.25 KB
/
default.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
---
#################################################
## QAMYDATA: Health Checks for Your Data Files ##
#################################################
# Welcome to the default configuration (config) file for QAMYDATA.
# The file is written in YAML (YAML Ain't Markup Language), which is a human-readable language commonly used for configuration files.
# The config is divided into 4 types of tests: Basic File Checks, Metadata Checks, Data Integrity Checks and Disclosure Control Checks
#
# You can change anything inside double quotes ("like this"), as well as the unquoted values that follow 'setting:' (names, numbers, true/false)
# Lines starting with '#' are ignored
#######################
## Basic File Checks ##
#######################
basic_file_checks:
  # Checks whether the file name contains illegal/odd/non-compliant characters.
  # 'setting' is a regular expression. Backslashes are doubled (\\) because the
  # value is a double-quoted YAML string, where a single backslash starts an
  # escape sequence.
  bad_filename:
    setting: "^([a-zA-Z0-9]+)\\.([a-zA-Z0-9]+)$"
    desc: "File name should match the user specified pattern"
#####################
## Metadata Checks ##
#####################
metadata:
  # Checks high-level grouping (for example, useful if dataset can be grouped by household)
  primary_variable:
    setting: HouseholdID
    desc: "Counts the unique occurrences for the grouping variable specified"

  # Checks whether any variables do not have labels
  missing_variable_labels:
    setting: true
    desc: "Variables should have a label"

  # Checks whether any user-defined missing values do not have labels (sysmis) - SPSS only
  value_defined_missing_no_label:
    setting: true
    desc: "User-defined missing values should have a label (SPSS only)"

  # Checks whether any variable names and labels contain illegal/odd/non-compliant characters
  # Each list entry is one character to look for; the third entry is a single space.
  variable_odd_characters:
    setting:
      - "&"
      - "#"
      - " "
      - "@"
      - "*"
      - "ç"
      - "ô"
      - "ü"
    desc: "Variable names and labels should not contain the specified characters"

  # Checks whether any value labels contain illegal/odd/non-compliant characters
  # Each list entry is one character to look for; the third entry is a single space.
  value_label_odd_characters:
    setting:
      - "&"
      - "#"
      - " "
      - "@"
      - "*"
      - "ç"
      - "ô"
      - "ü"
    desc: "Value labels should not contain the specified characters"

  # Checks whether any variable labels exceed user-defined number of characters, e.g. 79
  variable_label_max_length:
    setting: 79
    desc: "Variable labels should not exceed the defined number of characters"

  # Checks whether any value labels exceed user-defined number of characters, e.g. 39
  value_label_max_length:
    setting: 39
    desc: "Value labels should not exceed the defined number of characters"

  # Checks variable labels for spelling errors using a user-defined dictionary file
  # Please remember you must input the correct path to the dictionary file in order for the check to run on your data
  # You can use more than one file.
  # NOTE(review): the Windows path below looks like a placeholder (note the
  # 'dictonary' spelling) - replace it with the real location of your file.
  variable_label_spellcheck:
    setting:
      - "/usr/share/dict/words"
      - "C:\\path\\to\\dictonary\\file.txt"
    desc: "Variable labels should have correct spelling"

  # Checks value labels for spelling errors using a user-defined dictionary file
  # Please remember you must input the correct path to the dictionary file in order for the check to run on your data
  # You can use more than one file.
  value_label_spellcheck:
    setting:
      - "/usr/share/dict/words"
      - "C:\\path\\to\\dictonary\\file.txt"
    desc: "Value labels should have correct spelling"
###########################
## Data Integrity Checks ##
###########################
data_integrity:
  # Checks the user-specified variables for duplicate/repeated values (e.g. useful for checking duplicate IDs)
  # List one variable name per line.
  duplicate_values:
    setting:
      - Caseno
    desc: "Variable should not contain duplicate/repeated values"

  # Checks whether any string values contain illegal/odd/non-compliant characters
  # Each list entry is one character to look for; the third entry is a single space.
  string_value_odd_characters:
    setting:
      - "&"
      - "#"
      - " "
      - "@"
      - "*"
      - "ç"
      - "ô"
      - "ü"
    desc: "String values should not contain the specified characters"

  # Checks string values for spelling errors using a user-defined dictionary file
  # Please remember you must input the correct path to the dictionary file in order for the check to run on your data
  # You can use more than one file.
  # NOTE(review): the Windows path below looks like a placeholder (note the
  # 'dictonary' spelling) - replace it with the real location of your file.
  string_value_spellcheck:
    setting:
      - "/usr/share/dict/words"
      - "C:\\path\\to\\dictonary\\file.txt"
    desc: "String values should have correct spelling"

  # Checks the percentage of undefined missing values ('sysmis')
  system_missing_value_threshold:
    setting: 25
    desc: "Variable should not exceed the specified percentage of system missing values"
###############################
## Disclosure Control Checks ##
###############################
disclosure_risk:
  # # Checks for user-defined patterns by using code for regular expressions (RegEx). Further info can be found in the User Guide.
  # # This step is commented out from the default configuration file as it is highly dependent on user needs and can also be resource intensive.
  # # The following examples show RegEx checks for e-mail addresses and UK mobile phone numbers.
  # # To run this step for e-mail addresses and UK mobile phone numbers, please delete the single hash sign (#) and the space after it
  # # on each of the 5 lines below, starting at 'regex_patterns:'.
  # # Backslashes in a pattern must be doubled (\\) because the patterns are double-quoted YAML strings.
  # # You can modify the RegEx with other expressions by adding them in the format below. Remember to comment out any checks (using #) that you do not wish to run!
  # regex_patterns:
  #   setting:
  #     - "^([\\w\\.\\-]+)@([\\w\\-]+)((\\.(\\w){2,4})+)$"  # checks for e-mail addresses
  #     - "^(\\+44\\s?7\\d{3}|\\(?07\\d{3}\\)?)\\s?\\d{3}\\s?\\d{3}$"  # checks for UK mobile numbers
  #   desc: "Variable should not contain the user-specified RegEx pattern"

  # Checks whether variables contain unique values or number of observations below the set threshold
  unique_values:
    setting: 1
    desc: "Variable should not contain number of observations equal to or less than specified threshold"

  # Checks string values for words listed in a user defined dictionary. If a
  # word in the dictionary is found, it fails the check.
  # Please remember you must input the correct path to the dictionary file in
  # order for the check to run correctly on your data
  # You can use more than one file.
  string_value_stopword:
    setting:
      - "stopword.txt"
    desc: "String values should not contain user defined values"