blob: 7743ea8e5a85e298d85987224ae8176e4b5fb11b [file] [log] [blame]
.\" -----------------------------------------------------------------------
.\"
.\" Copyright 2005 H. Peter Anvin - All Rights Reserved
.\"
.\" Permission is hereby granted, free of charge, to any person
.\" obtaining a copy of this software and associated documentation
.\" files (the "Software"), to deal in the Software without
.\" restriction, including without limitation the rights to use,
.\" copy, modify, merge, publish, distribute, sublicense, and/or
.\" sell copies of the Software, and to permit persons to whom
.\" the Software is furnished to do so, subject to the following
.\" conditions:
.\"
.\" The above copyright notice and this permission notice shall
.\" be included in all copies or substantial portions of the Software.
.\"
.\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
.\" EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
.\" OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
.\" NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
.\" HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
.\" WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
.\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
.\" OTHER DEALINGS IN THE SOFTWARE.
.\"
.\" -----------------------------------------------------------------------
.TH LIBUCD 3 2005-12-29 libucd ""
.SH NAME
unicode_character_data, unicode_character_lookup, unicode_character_get,
unicode_character_put, unicode_database_version \-
query the Unicode Character Database
.SH SYNOPSIS
.nf
.B #include <ucd.h>
.sp
.B const struct unicode_character_data *unicode_character_data(int32_t);
.sp
.B const struct unicode_character_data *unicode_character_lookup(const char *);
.sp
.B const struct unicode_character_data *unicode_character_get(const struct unicode_character_data *);
.sp
.B void unicode_character_put(const struct unicode_character_data *);
.sp
.B uint32_t unicode_database_version(void);
.fi
.SH DESCRIPTION
These functions query the Unicode Character Database for attribute
information about a certain character. Attribute information is
represented by a pointer to a reference-counted structure. Any
internal pointers in the structure should be assumed to point to
memory protected by the same reference count.
.PP
\fBunicode_character_data()\fP returns the attributes for a character
specified by Unicode index. If the argument is outside the Unicode
range (0 to 0x10ffff) it returns NULL with \fIerrno\fP set to EINVAL.
.PP
\fBunicode_character_lookup()\fP searches for a character by Unicode
name (e.g. "LATIN CAPITAL LETTER A"). If no character by that name is
known, it returns NULL with \fIerrno\fP set to EINVAL. Only exact
matches are returned; if case-insensitive matching is desired, the
string should be upper-cased first.
.PP
\fBunicode_character_get()\fP creates a new reference to an attribute
structure, and returns the new reference. This pointer may or may not
be a pointer to the original structure.
.PP
\fBunicode_character_put()\fP unreferences an attribute structure and
frees it if appropriate.
.PP
\fBunicode_database_version()\fP returns the version of the Unicode
database from which the library was generated, in the format
(major << 16)+(minor << 8)+(subminor).
.PP
The \fIunicode_character_data\fP structure is defined in <ucd.h> and
contains at least the following fields:
.sp
.RS
.nf
.ne 4
.ta 0n 4n 44n
struct unicode_character_data {
int32_t ucs;
uint16_t size;
uint64_t fl;
const char *name;
int32_t simple_uppercase;
int32_t simple_lowercase;
int32_t simple_titlecase;
uint8_t numeric_value_num;
uint8_t numeric_value_den;
uint8_t numeric_value_exp;
uint8_t age_ma, age_mi;
enum unicode_general_category general_category;
enum unicode_block block;
enum unicode_script script;
enum unicode_joining_type joining_type;
enum unicode_joining_group joining_group;
enum unicode_east_asian_width east_asian_width;
enum unicode_hangul_syllable_type hangul_syllable_type;
enum unicode_numeric_type numeric_type;
enum unicode_canonical_combining_class canonical_combining_class;
enum unicode_bidi_class bidi_class;
enum unicode_grapheme_cluster_break grapheme_cluster_break;
enum unicode_sentence_break sentence_break;
enum unicode_word_break word_break;
enum unicode_line_break line_break;
};
.ta
.fi
.RE
.PP
The members of the \fIunicode_character_data\fP structure are:
.TP
.B ucs
The Unicode index of the character.
.TP
.B size
The size of the structure, in bytes. This can be used to determine
the availability of a specific field if one is added in future
versions.
.TP
.B fl
A bitwise OR of flags (UC_FL_*), defined in <ucd.h>.
.TP
.B name
The Unicode name of the character.
.TP
.B bidi_mirroring_glyph
The Unicode string which corresponds to the mirror image of this
character. \fINot yet implemented.\fP
.TP
.B simple_uppercase
The simple (single codepoint) uppercase mapping for this character.
.TP
.B simple_lowercase
The simple (single codepoint) lowercase mapping for this character.
.TP
.B simple_titlecase
The simple (single codepoint) titlecase mapping for this character.
.TP
.B numeric_value_num
.TP
.B numeric_value_den
.TP
.B numeric_value_exp
For a number, the numeric value is given as num/den * 10^exp.
.TP
.B age_ma
.TP
.B age_mi
Major and minor Unicode version when this character was introduced.
If this is a vacant codepoint, this has the value 0.0.
.PP
All enumerations are properties defined in the Unicode standard. Most
Unicode properties have both a long and a short form. The
corresponding strings can be obtained by calling the function
.sp
.B int unicode_property_names_\fIproperty\fP(enum
unicode_\fIproperty\fP \fIvalue\fP, const char **\fIlongname\fP, const char **\fIshortname\fP);
.sp
where the first argument is the enumeration value, and the \fIlongname\fP
and \fIshortname\fP arguments return pointers to the respective strings.
.SH "RETURN VALUE"
\fBunicode_character_data()\fP, \fBunicode_character_lookup()\fP, and
\fBunicode_character_get()\fP return an attribute structure pointer on
success, or NULL on failure. In the case of failure, \fIerrno\fP is
set to the appropriate error value (in the current implementation,
either EINVAL or ENOMEM).
.PP
\fBunicode_database_version()\fP returns the version of the underlying
Unicode database, in the format (major << 16)+(minor << 8)+(subminor).
.PP
The \fBunicode_property_names\fP functions return zero on success, or
nonzero if the enumeration value was out of range.
.SH "BUGS"
The fields related to bidirectional mirroring and non-simple case
mappings are not yet populated.
.PP
There is no interface to the Unihan database. This perhaps should be
a separate library.
.SH "SEE ALSO"
The Unicode Standard,
.IR http://www.unicode.org/ .