| .\" ----------------------------------------------------------------------- |
| .\" |
| .\" Copyright 2005 H. Peter Anvin - All Rights Reserved |
| .\" |
| .\" Permission is hereby granted, free of charge, to any person |
| .\" obtaining a copy of this software and associated documentation |
| .\" files (the "Software"), to deal in the Software without |
| .\" restriction, including without limitation the rights to use, |
| .\" copy, modify, merge, publish, distribute, sublicense, and/or |
| .\" sell copies of the Software, and to permit persons to whom |
| .\" the Software is furnished to do so, subject to the following |
| .\" conditions: |
| .\" |
| .\" The above copyright notice and this permission notice shall |
| .\" be included in all copies or substantial portions of the Software. |
| .\" |
| .\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| .\" EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
| .\" OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| .\" NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
| .\" HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| .\" WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| .\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| .\" OTHER DEALINGS IN THE SOFTWARE. |
| .\" |
| .\" ----------------------------------------------------------------------- |
| .TH LIBUCD 3 2005-12-29 libucd "" |
| .SH NAME |
| unicode_character_data, unicode_character_lookup, unicode_character_get, |
| unicode_character_put, unicode_database_version \- |
| query the Unicode Character Database |
| .SH SYNOPSIS |
| .nf |
| .B #include <ucd.h> |
| .sp |
| .B const struct unicode_character_data *unicode_character_data(int32_t); |
| .sp |
| .B const struct unicode_character_data *unicode_character_lookup(const char *); |
| .sp |
| .B const struct unicode_character_data *unicode_character_get(const struct unicode_character_data *); |
| .sp |
| .B void unicode_character_put(const struct unicode_character_data *); |
| .sp |
| .B uint32_t unicode_database_version(); |
| .fi |
| .SH DESCRIPTION |
| These functions query the Unicode Character Database for attribute |
| information about a certain character. Attribute information is |
| represented by a pointer to a reference-counted structure. Any |
| internal pointers in the structure should be assumed to point to |
| memory protected by the same reference count. |
| .PP |
| \fBunicode_character_data()\fP returns the attributes for a character |
| specified by Unicode index. If the argument is outside the Unicode |
| range (0 to 0x10ffff) it returns NULL with \fIerrno\fP set to EINVAL. |
| .PP |
| \fBunicode_character_lookup()\fP searches for a character by Unicode |
| name (e.g. "LATIN CAPITAL LETTER A".) If no character by that name is |
| known, it returns NULL with \fIerrno\fP set to EINVAL. Only exact |
| matches are returned; if case-insensitive matching is desired, the |
| string should be upper-cased. |
| .PP |
| \fBunicode_character_get()\fP creates a new reference to an attribute |
| structure, and returns the new reference. This pointer may or may not |
| be a pointer to the original structure. |
| .PP |
| \fBunicode_character_put()\fP unreferences an attribute structure and |
| frees it if appropriate. |
| .PP |
| \fBunicode_database_version()\fP returns the version of the Unicode |
| database from which the library was generated, in the format |
| (major << 16)+(minor << 8)+(subminor). |
| .PP |
| The \fIunicode_character_data\fP structure is defined in <ucd.h> and |
| contains at least the following fields: |
| .sp |
| .RS |
| .nf |
| .ne 4 |
| .ta 0n 4n 44n |
| struct unicode_character_data { |
| int32_t ucs; |
| uint16_t size; |
| uint64_t fl; |
| const char *name; |
| int32_t simple_uppercase; |
| int32_t simple_lowercase; |
| int32_t simple_titlecase; |
| uint8_t numeric_value_num; |
| uint8_t numeric_value_den; |
| uint8_t numeric_value_exp; |
| uint8_t age_ma, age_mi; |
| enum unicode_general_category general_category; |
| enum unicode_block block; |
| enum unicode_script script; |
| enum unicode_joining_type joining_type; |
| enum unicode_joining_group joining_group; |
| enum unicode_east_asian_width east_asian_width; |
| enum unicode_hangul_syllable_type hangul_syllable_type; |
| enum unicode_numeric_type numeric_type; |
| enum unicode_canonical_combining_class canonical_combining_class; |
| enum unicode_bidi_class bidi_class; |
| enum unicode_grapheme_cluster_break grapheme_cluster_break; |
| enum unicode_sentence_break sentence_break; |
| enum unicode_word_break word_break; |
| enum unicode_line_break line_break; |
| }; |
| .ta |
| .fi |
| .RE |
| .PP |
| The members of the \fIunicode_character_data\fP structure are: |
| .TP |
| .B ucs |
| The Unicode index of the character. |
| .TP |
| .B size |
| The size of the structure, in bytes. This can be used to determine |
| the availability of a specific field if one is added in future |
| versions. |
| .TP |
| .B fl |
| A bitmask of flags (UC_FL_*), defined in <ucd.h>. |
| .TP |
| .B name |
| The Unicode name of the character. |
| .TP |
| .B bidi_mirroring_glyph |
| The Unicode string which corresponds to the mirror image of this |
| character. \fINot yet implemented.\fP |
| .TP |
| .B simple_uppercase |
| The simple (single codepoint) uppercase mapping for this character. |
| .TP |
| .B simple_lowercase |
| The simple (single codepoint) lowercase mapping for this character. |
| .TP |
| .B simple_titlecase |
| The simple (single codepoint) titlecase mapping for this character. |
| .TP |
| .B numeric_value_num |
| .TP |
| .B numeric_value_den |
| .TP |
| .B numeric_value_exp |
| For a number, the numeric value is given as num/den * 10^exp. |
| .TP |
| .B age_ma |
| .TP |
| .B age_mi |
| Major and minor Unicode version when this character was introduced. |
| If this is a vacant codepoint, this has the value 0.0. |
| .PP |
| All enumerations are properties defined in the Unicode standard. Most |
| Unicode properties have both a long and a short form. The |
| corresponding strings can be obtained by calling the function |
| .sp |
| .B int unicode_property_names_\fIproperty\fP(enum |
| unicode_\fIproperty\fP \fIvalue\fP, const char **\fIlongname\fP, const char **\fIshortname\fP); |
| .sp |
| where the first argument is the enumeration value, and the \fIlongname\fP |
| and \fIshortname\fP arguments return pointers to the respective strings. |
| .SH "RETURN VALUE" |
| \fBunicode_character_data()\fP, \fBunicode_character_lookup()\fP, and |
| \fBunicode_character_get()\fP return an attribute structure pointer on |
| success, or NULL on failure. In the case of failure, \fIerrno\fP is |
| set to the appropriate error value (in the current implementation, |
| either EINVAL or ENOMEM.) |
| .PP |
| \fBunicode_database_version()\fP returns the version of the underlying |
| Unicode database, in the format (major << 16)+(minor << 8)+(subminor). |
| .PP |
| The \fBunicode_property_names\fP functions return zero on success, or |
| nonzero if the enumeration value was out of range. |
| .SH "BUGS" |
| The fields related to bidirectional mirroring and non-simple case |
| mappings are not yet populated. |
| .PP |
| There is no interface to the Unihan database. This perhaps should be |
| a separate library. |
| .SH "SEE ALSO" |
| The Unicode Standard, |
| .IR http://www.unicode.org/ . |
| |