Geant4 Cross Reference

Cross-Referencing   Geant4
Geant4/externals/g4tools/include/tools/toojpeg.icc

Version: [ ReleaseNotes ] [ 1.0 ] [ 1.1 ] [ 2.0 ] [ 3.0 ] [ 3.1 ] [ 3.2 ] [ 4.0 ] [ 4.0.p1 ] [ 4.0.p2 ] [ 4.1 ] [ 4.1.p1 ] [ 5.0 ] [ 5.0.p1 ] [ 5.1 ] [ 5.1.p1 ] [ 5.2 ] [ 5.2.p1 ] [ 5.2.p2 ] [ 6.0 ] [ 6.0.p1 ] [ 6.1 ] [ 6.2 ] [ 6.2.p1 ] [ 6.2.p2 ] [ 7.0 ] [ 7.0.p1 ] [ 7.1 ] [ 7.1.p1 ] [ 8.0 ] [ 8.0.p1 ] [ 8.1 ] [ 8.1.p1 ] [ 8.1.p2 ] [ 8.2 ] [ 8.2.p1 ] [ 8.3 ] [ 8.3.p1 ] [ 8.3.p2 ] [ 9.0 ] [ 9.0.p1 ] [ 9.0.p2 ] [ 9.1 ] [ 9.1.p1 ] [ 9.1.p2 ] [ 9.1.p3 ] [ 9.2 ] [ 9.2.p1 ] [ 9.2.p2 ] [ 9.2.p3 ] [ 9.2.p4 ] [ 9.3 ] [ 9.3.p1 ] [ 9.3.p2 ] [ 9.4 ] [ 9.4.p1 ] [ 9.4.p2 ] [ 9.4.p3 ] [ 9.4.p4 ] [ 9.5 ] [ 9.5.p1 ] [ 9.5.p2 ] [ 9.6 ] [ 9.6.p1 ] [ 9.6.p2 ] [ 9.6.p3 ] [ 9.6.p4 ] [ 10.0 ] [ 10.0.p1 ] [ 10.0.p2 ] [ 10.0.p3 ] [ 10.0.p4 ] [ 10.1 ] [ 10.1.p1 ] [ 10.1.p2 ] [ 10.1.p3 ] [ 10.2 ] [ 10.2.p1 ] [ 10.2.p2 ] [ 10.2.p3 ] [ 10.3 ] [ 10.3.p1 ] [ 10.3.p2 ] [ 10.3.p3 ] [ 10.4 ] [ 10.4.p1 ] [ 10.4.p2 ] [ 10.4.p3 ] [ 10.5 ] [ 10.5.p1 ] [ 10.6 ] [ 10.6.p1 ] [ 10.6.p2 ] [ 10.6.p3 ] [ 10.7 ] [ 10.7.p1 ] [ 10.7.p2 ] [ 10.7.p3 ] [ 10.7.p4 ] [ 11.0 ] [ 11.0.p1 ] [ 11.0.p2 ] [ 11.0.p3, ] [ 11.0.p4 ] [ 11.1 ] [ 11.1.1 ] [ 11.1.2 ] [ 11.1.3 ] [ 11.2 ] [ 11.2.1 ] [ 11.2.2 ] [ 11.3.0 ]

  1 
  2 // G.Barrand: pure header version of toojpeg found at https://github.com/stbrumme/toojpeg
  3 
  4 // //////////////////////////////////////////////////////////
  5 // toojpeg.cpp
  6 // written by Stephan Brumme, 2018-2019
  7 // see https://create.stephan-brumme.com/toojpeg/
  8 //
  9 
 10 #include <cstddef> //size_t
 11 
 12 // - the "official" specifications: https://www.w3.org/Graphics/JPEG/itu-t81.pdf and https://www.w3.org/Graphics/JPEG/jfif3.pdf
 13 // - Wikipedia has a short description of the JFIF/JPEG file format: https://en.wikipedia.org/wiki/JPEG_File_Interchange_Format
 14 // - the popular STB Image library includes Jon's JPEG encoder as well: https://github.com/nothings/stb/blob/master/stb_image_write.h
 15 // - the most readable JPEG book (from a developer's perspective) is Miano's "Compressed Image File Formats" (1999, ISBN 0-201-60443-4),
 16 //   used copies are really cheap nowadays and include a CD with C++ sources as well (plus great format descriptions of GIF & PNG)
 17 // - much more detailled is Mitchell/Pennebaker's "JPEG: Still Image Data Compression Standard" (1993, ISBN 0-442-01272-1)
 18 //   which contains the official JPEG standard, too - fun fact: I bought a signed copy in a second-hand store without noticing
 19 
 20 namespace tools {
 21 namespace toojpeg {
 22 // ////////////////////////////////////////
 23 // data types
 24 typedef unsigned char uint8_t;
 25 typedef unsigned short uint16_t;
 26 typedef short int16_t;
 27 typedef int int32_t; // at least four bytes
 28 
 29 // ////////////////////////////////////////
 30 // constants
 31 
 32 // quantization tables from JPEG Standard, Annex K
 33 const uint8_t DefaultQuantLuminance[8*8] =
 34     { 16, 11, 10, 16, 24, 40, 51, 61, // there are a few experts proposing slightly more efficient values,
 35       12, 12, 14, 19, 26, 58, 60, 55, // e.g. https://www.imagemagick.org/discourse-server/viewtopic.php?t=20333
 36       14, 13, 16, 24, 40, 57, 69, 56, // btw: Google's Guetzli project optimizes the quantization tables per image
 37       14, 17, 22, 29, 51, 87, 80, 62,
 38       18, 22, 37, 56, 68,109,103, 77,
 39       24, 35, 55, 64, 81,104,113, 92,
 40       49, 64, 78, 87,103,121,120,101,
 41       72, 92, 95, 98,112,100,103, 99 };
 42 const uint8_t DefaultQuantChrominance[8*8] =
 43     { 17, 18, 24, 47, 99, 99, 99, 99,
 44       18, 21, 26, 66, 99, 99, 99, 99,
 45       24, 26, 56, 99, 99, 99, 99, 99,
 46       47, 66, 99, 99, 99, 99, 99, 99,
 47       99, 99, 99, 99, 99, 99, 99, 99,
 48       99, 99, 99, 99, 99, 99, 99, 99,
 49       99, 99, 99, 99, 99, 99, 99, 99,
 50       99, 99, 99, 99, 99, 99, 99, 99 };
 51 
 52 // 8x8 blocks are processed in zig-zag order
 53 // most encoders use a zig-zag "forward" table, I switched to its inverse for performance reasons
 54 // note: ZigZagInv[ZigZag[i]] = i
 55 const uint8_t ZigZagInv[8*8] =
 56     {  0, 1, 8,16, 9, 2, 3,10,   // ZigZag[] =  0, 1, 5, 6,14,15,27,28,
 57       17,24,32,25,18,11, 4, 5,   //             2, 4, 7,13,16,26,29,42,
 58       12,19,26,33,40,48,41,34,   //             3, 8,12,17,25,30,41,43,
 59       27,20,13, 6, 7,14,21,28,   //             9,11,18,24,31,40,44,53,
 60       35,42,49,56,57,50,43,36,   //            10,19,23,32,39,45,52,54,
 61       29,22,15,23,30,37,44,51,   //            20,22,33,38,46,51,55,60,
 62       58,59,52,45,38,31,39,46,   //            21,34,37,47,50,56,59,61,
 63       53,60,61,54,47,55,62,63 }; //            35,36,48,49,57,58,62,63
 64 
 65 // static Huffman code tables from JPEG standard Annex K
 66 // - CodesPerBitsize tables define how many Huffman codes will have a certain bitsize (plus 1 because there nothing with zero bits),
 67 //   e.g. DcLuminanceCodesPerBitsize[2] = 5 because there are 5 Huffman codes being 2+1=3 bits long
 68 // - Values tables are a list of values ordered by their Huffman code bitsize,
 69 //   e.g. AcLuminanceValues => Huffman(0x01,0x02 and 0x03) will have 2 bits, Huffman(0x00) will have 3 bits, Huffman(0x04,0x11 and 0x05) will have 4 bits, ...
 70 
 71 // Huffman definitions for first DC/AC tables (luminance / Y channel)
 72 const uint8_t DcLuminanceCodesPerBitsize[16]   = { 0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0 };   // sum = 12
 73 const uint8_t DcLuminanceValues         [12]   = { 0,1,2,3,4,5,6,7,8,9,10,11 };         // => 12 codes
 74 const uint8_t AcLuminanceCodesPerBitsize[16]   = { 0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,125 }; // sum = 162
 75 const uint8_t AcLuminanceValues        [162]   =                                        // => 162 codes
 76     { 0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xA1,0x08, // 16*10+2 symbols because
 77       0x23,0x42,0xB1,0xC1,0x15,0x52,0xD1,0xF0,0x24,0x33,0x62,0x72,0x82,0x09,0x0A,0x16,0x17,0x18,0x19,0x1A,0x25,0x26,0x27,0x28, // upper 4 bits can be 0..F
 78       0x29,0x2A,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x53,0x54,0x55,0x56,0x57,0x58,0x59, // while lower 4 bits can be 1..A
 79       0x5A,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x83,0x84,0x85,0x86,0x87,0x88,0x89, // plus two special codes 0x00 and 0xF0
 80       0x8A,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xAA,0xB2,0xB3,0xB4,0xB5,0xB6, // order of these symbols was determined empirically by JPEG committee
 81       0xB7,0xB8,0xB9,0xBA,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,0xD8,0xD9,0xDA,0xE1,0xE2,
 82       0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA };
 83 // Huffman definitions for second DC/AC tables (chrominance / Cb and Cr channels)
 84 const uint8_t DcChrominanceCodesPerBitsize[16] = { 0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0 };   // sum = 12
 85 const uint8_t DcChrominanceValues         [12] = { 0,1,2,3,4,5,6,7,8,9,10,11 };         // => 12 codes (identical to DcLuminanceValues)
 86 const uint8_t AcChrominanceCodesPerBitsize[16] = { 0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,119 }; // sum = 162
 87 const uint8_t AcChrominanceValues        [162] =                                        // => 162 codes
 88     { 0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91, // same number of symbol, just different order
 89       0xA1,0xB1,0xC1,0x09,0x23,0x33,0x52,0xF0,0x15,0x62,0x72,0xD1,0x0A,0x16,0x24,0x34,0xE1,0x25,0xF1,0x17,0x18,0x19,0x1A,0x26, // (which is more efficient for AC coding)
 90       0x27,0x28,0x29,0x2A,0x35,0x36,0x37,0x38,0x39,0x3A,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x53,0x54,0x55,0x56,0x57,0x58,
 91       0x59,0x5A,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x82,0x83,0x84,0x85,0x86,0x87,
 92       0x88,0x89,0x8A,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xAA,0xB2,0xB3,0xB4,
 93       0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,0xD8,0xD9,0xDA,
 94       0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA };
 95 const int16_t CodeWordLimit = 2048; // +/-2^11, maximum value after DCT
 96 
 97 // ////////////////////////////////////////
 98 // structs
 99 
100 // represent a single Huffman code
101 struct BitCode
102 {
103   //BitCode() = default; // undefined state, must be initialized at a later time
104   BitCode():code(0),numBits(0) {}
105   BitCode(const BitCode& a_from):code(a_from.code),numBits(a_from.numBits) {}
106   BitCode& operator=(const BitCode& a_from) {
107     code = a_from.code;
108     numBits = a_from.numBits;
109     return *this;
110   }    
111   
112   BitCode(uint16_t code_, uint8_t numBits_)
113   : code(code_), numBits(numBits_) {}
114   uint16_t code;       // JPEG's Huffman codes are limited to 16 bits
115   uint8_t  numBits;    // number of valid bits
116 };
117 
118 // wrapper for bit output operations
119 struct BitWriter
120 {
121   // user-supplied callback that writes/stores one byte
122   WRITE_ONE_BYTE output;
123   void* tag;
124   // initialize writer
125   explicit BitWriter(WRITE_ONE_BYTE output_,void* tag_) : output(output_),tag(tag_) {
126     buffer.data = 0;
127     buffer.numBits = 0;
128   }
129 
130   // store the most recently encoded bits that are not written yet
131   struct BitBuffer
132   {
133     int32_t data    /*= 0*/; // actually only at most 24 bits are used
134     uint8_t numBits /*= 0*/; // number of valid bits (the right-most bits)
135   } buffer;
136 
137   // write Huffman bits stored in BitCode, keep excess bits in BitBuffer
138   BitWriter& operator<<(const BitCode& data)
139   {
140     // append the new bits to those bits leftover from previous call(s)
141     buffer.numBits += data.numBits;
142     buffer.data   <<= data.numBits;
143     buffer.data    |= data.code;
144 
145     // write all "full" bytes
146     while (buffer.numBits >= 8)
147     {
148       // extract highest 8 bits
149       buffer.numBits -= 8;
150       uint8_t oneByte = uint8_t(buffer.data >> buffer.numBits);
151       output(oneByte,tag);
152 
153       if (oneByte == 0xFF) // 0xFF has a special meaning for JPEGs (it's a block marker)
154         output(0,tag);         // therefore pad a zero to indicate "nope, this one ain't a marker, it's just a coincidence"
155 
156       // note: I don't clear those written bits, therefore buffer.bits may contain garbage in the high bits
157       //       if you really want to "clean up" (e.g. for debugging purposes) then uncomment the following line
158       //buffer.bits &= (1 << buffer.numBits) - 1;
159     }
160     return *this;
161   }
162 
163   // write all non-yet-written bits, fill gaps with 1s (that's a strange JPEG thing)
164   void flush()
165   {
166     // at most seven set bits needed to "fill" the last byte: 0x7F = binary 0111 1111
167     *this << BitCode(0x7F, 7); // I should set buffer.numBits = 0 but since there are no single bits written after flush() I can safely ignore it
168   }
169 
170   // NOTE: all the following BitWriter functions IGNORE the BitBuffer and write straight to output !
171   // write a single byte
172   BitWriter& operator<<(uint8_t oneByte)
173   {
174     output(oneByte,tag);
175     return *this;
176   }
177 
178   // write an array of bytes
179   template <typename T, int Size>
180   BitWriter& operator<<(T (&manyBytes)[Size])
181   {
182   //for (auto c : manyBytes)
183   //  output(c);
184     for(size_t i=0;i<Size;i++) output(manyBytes[i],tag);
185     return *this;
186   }
187 
188   // start a new JFIF block
189   void addMarker(uint8_t id, uint16_t length)
190   {
191     output(0xFF,tag); output(id,tag);     // ID, always preceded by 0xFF
192     output(uint8_t(length >> 8),tag); // length of the block (big-endian, includes the 2 length bytes as well)
193     output(uint8_t(length & 0xFF),tag);
194   }
195 };
196 
197 // ////////////////////////////////////////
198 // functions / templates
199 
200 // same as std::min()
201 template <typename Number>
202 inline Number minimum(Number value, Number maximum)
203 {
204   return value <= maximum ? value : maximum;
205 }
206 
207 // restrict a value to the interval [minimum, maximum]
208 template <typename Number, typename Limit>
209 inline Number clamp(Number value, Limit minValue, Limit maxValue)
210 {
211   if (value <= minValue) return minValue; // never smaller than the minimum
212   if (value >= maxValue) return maxValue; // never bigger  than the maximum
213   return value;                           // value was inside interval, keep it
214 }
215 
216 // convert from RGB to YCbCr, constants are similar to ITU-R, see https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion
217 inline float rgb2y (float r, float g, float b) { return +0.299f   * r +0.587f   * g +0.114f   * b; }
218 inline float rgb2cb(float r, float g, float b) { return -0.16874f * r -0.33126f * g +0.5f     * b; }
219 inline float rgb2cr(float r, float g, float b) { return +0.5f     * r -0.41869f * g -0.08131f * b; }
220 
221 // forward DCT computation "in one dimension" (fast AAN algorithm by Arai, Agui and Nakajima: "A fast DCT-SQ scheme for images")
222 inline void DCT(float block[8*8], uint8_t stride) // stride must be 1 (=horizontal) or 8 (=vertical)
223 {
224   const float SqrtHalfSqrt = 1.306562965f; //    sqrt((2 + sqrt(2)) / 2) = cos(pi * 1 / 8) * sqrt(2)
225   const float InvSqrt      = 0.707106781f; // 1 / sqrt(2)                = cos(pi * 2 / 8)
226   const float HalfSqrtSqrt = 0.382683432f; //     sqrt(2 - sqrt(2)) / 2  = cos(pi * 3 / 8)
227   const float InvSqrtSqrt  = 0.541196100f; // 1 / sqrt(2 - sqrt(2))      = cos(pi * 3 / 8) * sqrt(2)
228 
229   // modify in-place
230   float& block0 = block[0         ];
231   float& block1 = block[1 * stride];
232   float& block2 = block[2 * stride];
233   float& block3 = block[3 * stride];
234   float& block4 = block[4 * stride];
235   float& block5 = block[5 * stride];
236   float& block6 = block[6 * stride];
237   float& block7 = block[7 * stride];
238 
239   // based on https://dev.w3.org/Amaya/libjpeg/jfdctflt.c , the original variable names can be found in my comments
240   float add07 = block0 + block7; float sub07 = block0 - block7; // tmp0, tmp7
241   float add16 = block1 + block6; float sub16 = block1 - block6; // tmp1, tmp6
242   float add25 = block2 + block5; float sub25 = block2 - block5; // tmp2, tmp5
243   float add34 = block3 + block4; float sub34 = block3 - block4; // tmp3, tmp4
244 
245   float add0347 = add07 + add34; float sub07_34 = add07 - add34; // tmp10, tmp13 ("even part" / "phase 2")
246   float add1256 = add16 + add25; float sub16_25 = add16 - add25; // tmp11, tmp12
247 
248   block0 = add0347 + add1256; block4 = add0347 - add1256; // "phase 3"
249 
250   float z1 = (sub16_25 + sub07_34) * InvSqrt; // all temporary z-variables kept their original names
251   block2 = sub07_34 + z1; block6 = sub07_34 - z1; // "phase 5"
252 
253   float sub23_45 = sub25 + sub34; // tmp10 ("odd part" / "phase 2")
254   float sub12_56 = sub16 + sub25; // tmp11
255   float sub01_67 = sub16 + sub07; // tmp12
256 
257   float z5 = (sub23_45 - sub01_67) * HalfSqrtSqrt;
258   float z2 = sub23_45 * InvSqrtSqrt  + z5;
259   float z3 = sub12_56 * InvSqrt;
260   float z4 = sub01_67 * SqrtHalfSqrt + z5;
261   float z6 = sub07 + z3; // z11 ("phase 5")
262   float z7 = sub07 - z3; // z13
263   block1 = z6 + z4; block7 = z6 - z4; // "phase 6"
264   block5 = z7 + z2; block3 = z7 - z2;
265 }
266 
267 // run DCT, quantize and write Huffman bit codes
268 inline int16_t encodeBlock(BitWriter& writer, float block[8][8], const float scaled[8*8], int16_t lastDC,
269                     const BitCode huffmanDC[256], const BitCode huffmanAC[256], const BitCode* codewords)
270 {
271   // "linearize" the 8x8 block, treat it as a flat array of 64 floats
272   float* block64 = (float*) block;
273 
274   // DCT: rows
275   for (size_t offset = 0; offset < 8; offset++)
276     DCT(block64 + offset*8, 1);
277   // DCT: columns
278   for (size_t offset = 0; offset < 8; offset++)
279     DCT(block64 + offset*1, 8);
280 
281   // scale
282   for (size_t i = 0; i < 8*8; i++)
283     block64[i] *= scaled[i];
284 
285   // encode DC (the first coefficient is the "average color" of the 8x8 block)
286   int DC = int(block64[0] + (block64[0] >= 0 ? +0.5f : -0.5f)); // C++11's nearbyint() achieves a similar effect
287 
288   // quantize and zigzag the other 63 coefficients
289   size_t posNonZero = 0; // find last coefficient which is not zero (because trailing zeros are encoded differently)
290   int16_t quantized[8*8];
291   for (size_t i = 1; i < 8*8; i++) // start at 1 because block64[0]=DC was already processed
292   {
293     float value = block64[ZigZagInv[i]];
294     // round to nearest integer
295     quantized[i] = int(value + (value >= 0 ? +0.5f : -0.5f)); // C++11's nearbyint() achieves a similar effect
296     // remember offset of last non-zero coefficient
297     if (quantized[i] != 0)
298       posNonZero = i;
299   }
300 
301   // same "average color" as previous block ?
302   int diff = DC - lastDC;
303   if (diff == 0)
304     writer << huffmanDC[0x00];   // yes, write a special short symbol
305   else
306   {
307     const BitCode bits = codewords[diff]; // nope, encode the difference to previous block's average color
308     writer << huffmanDC[bits.numBits] << bits;
309   }
310 
311   // encode ACs (quantized[1..63])
312   size_t offset = 0; // upper 4 bits count the number of consecutive zeros
313   for (size_t i = 1; i <= posNonZero; i++) // quantized[0] was already written, skip all trailing zeros, too
314   {
315     // zeros are encoded in a special way
316     while (quantized[i] == 0) // found another zero ?
317     {
318       offset    += 0x10; // add 1 to the upper 4 bits
319       // split into blocks of at most 16 consecutive zeros
320       if (offset > 0xF0) // remember, the counter is in the upper 4 bits, 0xF = 15
321       {
322         writer << huffmanAC[0xF0]; // 0xF0 is a special code for "16 zeros"
323         offset = 0;
324       }
325       i++;
326     }
327 
328     const BitCode encoded = codewords[quantized[i]];
329     // combine number of zeros with the number of bits of the next non-zero value
330     writer << huffmanAC[offset + encoded.numBits] << encoded; // and the value itself
331     offset = 0;
332   }
333 
334   // send end-of-block code (0x00), only needed if there are trailing zeros
335   if (posNonZero < 8*8 - 1) // = 63
336     writer << huffmanAC[0x00];
337 
338   return DC;
339 }
340 
341 // Jon's code includes the pre-generated Huffman codes
342 // I don't like these "magic constants" and compute them on my own :-)
343 inline void generateHuffmanTable(const uint8_t numCodes[16], const uint8_t* values, BitCode result[256])
344 {
345   // process all bitsizes 1 thru 16, no JPEG Huffman code is allowed to exceed 16 bits
346   uint16_t huffmanCode = 0;
347   for (uint8_t numBits = 1; numBits <= 16; numBits++)
348   {
349     // ... and each code of these bitsizes
350     for (uint8_t i = 0; i < numCodes[numBits - 1]; i++) // note: numCodes array starts at zero, but smallest bitsize is 1
351       result[*values++] = BitCode(huffmanCode++, numBits);
352 
353     // next Huffman code needs to be one bit wider
354     huffmanCode <<= 1;
355   }
356 }
357 
358 // -------------------- externally visible code --------------------
359 
360 // the only exported function ...
361 inline bool writeJpeg(WRITE_ONE_BYTE output, void* tag,const void* pixels_, unsigned short width, unsigned short height,
362                bool isRGB, unsigned char quality_, bool downsample, const char* comment)
363 {
364   // reject invalid pointers
365   if (output == 0/*nullptr*/ || pixels_ == 0/*nullptr*/)
366     return false;
367   // check image format
368   if (width == 0 || height == 0)
369     return false;
370 
371   // number of components
372   const uint16_t numComponents = isRGB ? 3 : 1;
373   // note: if there is just one component (=grayscale), then only luminance needs to be stored in the file
374   //       thus everything related to chrominance need not to be written to the JPEG
375   //       I still compute a few things, like quantization tables to avoid a complete code mess
376 
377   // grayscale images can't be downsampled (because there are no Cb + Cr channels)
378   if (!isRGB)
379     downsample = false;
380 
381   // wrapper for all output operations
382   BitWriter bitWriter(output,tag);
383 
384   // ////////////////////////////////////////
385   // JFIF headers
386   const uint8_t HeaderJfif[2+2+16] =
387       { 0xFF,0xD8,         // SOI marker (start of image)
388         0xFF,0xE0,         // JFIF APP0 tag
389         0,16,              // length: 16 bytes (14 bytes payload + 2 bytes for this length field)
390         'J','F','I','F',0, // JFIF identifier, zero-terminated
391         1,1,               // JFIF version 1.1
392         0,                 // no density units specified
393         0,1,0,1,           // density: 1 pixel "per pixel" horizontally and vertically
394         0,0 };             // no thumbnail (size 0 x 0)
395   bitWriter << HeaderJfif;
396 
397   // ////////////////////////////////////////
398   // comment (optional)
399   if (comment != 0/*nullptr*/)
400   {
401     // look for zero terminator
402     uint16_t length = 0; // = strlen(comment);
403     while (comment[length] != 0)
404       length++;
405 
406     // write COM marker
407     bitWriter.addMarker(0xFE, 2+length); // block size is number of bytes (without zero terminator) + 2 bytes for this length field
408     // ... and write the comment itself
409     for (uint16_t i = 0; i < length; i++)
410       bitWriter << comment[i];
411   }
412 
413   // ////////////////////////////////////////
414   // adjust quantization tables to desired quality
415 
416   // quality level must be in 1 ... 100
417   uint16_t quality = clamp<uint16_t>(quality_, 1, 100);
418   // convert to an internal JPEG quality factor, formula taken from libjpeg
419   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
420 
421   uint8_t quantLuminance  [8*8];
422   uint8_t quantChrominance[8*8];
423   for (size_t i = 0; i < 8*8; i++)
424   {
425     int luminance   = (DefaultQuantLuminance  [ZigZagInv[i]] * quality + 50) / 100;
426     int chrominance = (DefaultQuantChrominance[ZigZagInv[i]] * quality + 50) / 100;
427 
428     // clamp to 1..255
429     quantLuminance  [i] = clamp(luminance,   1, 255);
430     quantChrominance[i] = clamp(chrominance, 1, 255);
431   }
432 
433   // write quantization tables
434   bitWriter.addMarker(0xDB, 2 + (isRGB ? 2 : 1) * (1 + 8*8)); // length: 65 bytes per table + 2 bytes for this length field
435                                                               // each table has 64 entries and is preceded by an ID byte
436 
437   bitWriter   << 0x00 << quantLuminance;   // first  quantization table
438   if (isRGB)
439     bitWriter << 0x01 << quantChrominance; // second quantization table, only relevant for color images
440 
441   // ////////////////////////////////////////
442   // write image infos (SOF0 - start of frame)
443   bitWriter.addMarker(0xC0, 2+6+3*numComponents); // length: 6 bytes general info + 3 per channel + 2 bytes for this length field
444 
445   // 8 bits per channel
446   bitWriter << 0x08
447   // image dimensions (big-endian)
448             << (height >> 8) << (height & 0xFF)
449             << (width  >> 8) << (width  & 0xFF);
450 
451   // sampling and quantization tables for each component
452   bitWriter << numComponents;       // 1 component (grayscale, Y only) or 3 components (Y,Cb,Cr)
453   for (uint16_t id = 1; id <= numComponents; id++)
454     bitWriter <<  id                // component ID (Y=1, Cb=2, Cr=3)
455     // bitmasks for sampling: highest 4 bits: horizontal, lowest 4 bits: vertical
456               << (id == 1 && downsample ? 0x22 : 0x11) // 0x11 is default YCbCr 4:4:4 and 0x22 stands for YCbCr 4:2:0
457               << (id == 1 ? 0 : 1); // use quantization table 0 for Y, table 1 for Cb and Cr
458 
459   // ////////////////////////////////////////
460   // Huffman tables
461   // DHT marker - define Huffman tables
462   bitWriter.addMarker(0xC4, isRGB ? (2+208+208) : (2+208));
463                             // 2 bytes for the length field, store chrominance only if needed
464                             //   1+16+12  for the DC luminance
465                             //   1+16+162 for the AC luminance   (208 = 1+16+12 + 1+16+162)
466                             //   1+16+12  for the DC chrominance
467                             //   1+16+162 for the AC chrominance (208 = 1+16+12 + 1+16+162, same as above)
468 
469   // store luminance's DC+AC Huffman table definitions
470   bitWriter << 0x00 // highest 4 bits: 0 => DC, lowest 4 bits: 0 => Y (baseline)
471             << DcLuminanceCodesPerBitsize
472             << DcLuminanceValues;
473   bitWriter << 0x10 // highest 4 bits: 1 => AC, lowest 4 bits: 0 => Y (baseline)
474             << AcLuminanceCodesPerBitsize
475             << AcLuminanceValues;
476 
477   // compute actual Huffman code tables (see Jon's code for precalculated tables)
478   BitCode huffmanLuminanceDC[256];
479   BitCode huffmanLuminanceAC[256];
480   generateHuffmanTable(DcLuminanceCodesPerBitsize, DcLuminanceValues, huffmanLuminanceDC);
481   generateHuffmanTable(AcLuminanceCodesPerBitsize, AcLuminanceValues, huffmanLuminanceAC);
482 
483   // chrominance is only relevant for color images
484   BitCode huffmanChrominanceDC[256];
485   BitCode huffmanChrominanceAC[256];
486   if (isRGB)
487   {
488     // store luminance's DC+AC Huffman table definitions
489     bitWriter << 0x01 // highest 4 bits: 0 => DC, lowest 4 bits: 1 => Cr,Cb (baseline)
490               << DcChrominanceCodesPerBitsize
491               << DcChrominanceValues;
492     bitWriter << 0x11 // highest 4 bits: 1 => AC, lowest 4 bits: 1 => Cr,Cb (baseline)
493               << AcChrominanceCodesPerBitsize
494               << AcChrominanceValues;
495 
496     // compute actual Huffman code tables (see Jon's code for precalculated tables)
497     generateHuffmanTable(DcChrominanceCodesPerBitsize, DcChrominanceValues, huffmanChrominanceDC);
498     generateHuffmanTable(AcChrominanceCodesPerBitsize, AcChrominanceValues, huffmanChrominanceAC);
499   }
500 
501   // ////////////////////////////////////////
502   // start of scan (there is only a single scan for baseline JPEGs)
503   bitWriter.addMarker(0xDA, 2+1+2*numComponents+3); // 2 bytes for the length field, 1 byte for number of components,
504                                                     // then 2 bytes for each component and 3 bytes for spectral selection
505 
506   // assign Huffman tables to each component
507   bitWriter << numComponents;
508   for (uint16_t id = 1; id <= numComponents; id++)
509     // highest 4 bits: DC Huffman table, lowest 4 bits: AC Huffman table
510     bitWriter << id << (id == 1 ? 0x00 : 0x11); // Y: tables 0 for DC and AC; Cb + Cr: tables 1 for DC and AC
511 
512   // constant values for our baseline JPEGs (which have a single sequential scan)
513   static const uint8_t Spectral[3] = { 0, 63, 0 }; // spectral selection: must be from 0 to 63; successive approximation must be 0
514   bitWriter << Spectral;
515 
516   // ////////////////////////////////////////
517   // adjust quantization tables with AAN scaling factors to simplify DCT
518   float scaledLuminance  [8*8];
519   float scaledChrominance[8*8];
520   for (size_t i = 0; i < 8*8; i++)
521   {
522     size_t row    = ZigZagInv[i] / 8; // same as ZigZagInv[i] >> 3
523     size_t column = ZigZagInv[i] % 8; // same as ZigZagInv[i] &  7
524 
525     // scaling constants for AAN DCT algorithm: AanScaleFactors[0] = 1, AanScaleFactors[k=1..7] = cos(k*PI/16) * sqrt(2)
526     static const float AanScaleFactors[8] = { 1, 1.387039845f, 1.306562965f, 1.175875602f, 1, 0.785694958f, 0.541196100f, 0.275899379f };
527     float factor = 1 / (AanScaleFactors[row] * AanScaleFactors[column] * 8);
528     scaledLuminance  [ZigZagInv[i]] = factor / quantLuminance  [i];
529     scaledChrominance[ZigZagInv[i]] = factor / quantChrominance[i];
530     // if you really want JPEGs that are bitwise identical to Jon Olick's code then you need slightly different formulas (note: sqrt(8) = 2.828427125f)
531     //static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f }; // line 240 of jo_jpeg.cpp
532     //scaledLuminance  [ZigZagInv[i]] = 1 / (quantLuminance  [i] * aasf[row] * aasf[column]); // lines 266-267 of jo_jpeg.cpp
533     //scaledChrominance[ZigZagInv[i]] = 1 / (quantChrominance[i] * aasf[row] * aasf[column]);
534   }
535 
536   // ////////////////////////////////////////
537   // precompute JPEG codewords for quantized DCT
538   BitCode  codewordsArray[2 * CodeWordLimit];          // note: quantized[i] is found at codewordsArray[quantized[i] + CodeWordLimit]
539   BitCode* codewords = &codewordsArray[CodeWordLimit]; // allow negative indices, so quantized[i] is at codewords[quantized[i]]
540   uint8_t numBits = 1; // each codeword has at least one bit (value == 0 is undefined)
541   int32_t mask    = 1; // mask is always 2^numBits - 1, initial value 2^1-1 = 2-1 = 1
542   for (int16_t value = 1; value < CodeWordLimit; value++)
543   {
544     // numBits = position of highest set bit (ignoring the sign)
545     // mask    = (2^numBits) - 1
546     if (value > mask) // one more bit ?
547     {
548       numBits++;
549       mask = (mask << 1) | 1; // append a set bit
550     }
551     codewords[-value] = BitCode(mask - value, numBits); // note that I use a negative index => codewords[-value] = codewordsArray[CodeWordLimit  value]
552     codewords[+value] = BitCode(       value, numBits);
553   }
554 
555   // just convert image data from void*
556   const uint8_t* pixels = (const uint8_t*)pixels_;
557 
558   // the next two variables are frequently used when checking for image borders
559   const unsigned short maxWidth  = width  - 1; // "last row"
560   const unsigned short maxHeight = height - 1; // "bottom line"
561 
562   // process MCUs (minimum codes units) => image is subdivided into a grid of 8x8 or 16x16 tiles
563   const unsigned short sampling = downsample ? 2 : 1; // 1x1 or 2x2 sampling
564   const unsigned short mcuSize  = 8 * sampling;
565 
566   // average color of the previous MCU
567   int16_t lastYDC = 0, lastCbDC = 0, lastCrDC = 0;
568   // convert from RGB to YCbCr
569   float Y[8][8], Cb[8][8], Cr[8][8];
570 
571   for (unsigned short mcuY = 0; mcuY < height; mcuY += mcuSize) // each step is either 8 or 16 (=mcuSize)
572     for (unsigned short mcuX = 0; mcuX < width; mcuX += mcuSize)
573     {
574       // YCbCr 4:4:4 format: each MCU is a 8x8 block - the same applies to grayscale images, too
575       // YCbCr 4:2:0 format: each MCU represents a 16x16 block, stored as 4x 8x8 Y-blocks plus 1x 8x8 Cb and 1x 8x8 Cr block)
576       for (unsigned short blockY = 0; blockY < mcuSize; blockY += 8) // iterate once (YCbCr444 and grayscale) or twice (YCbCr420)
577         for (unsigned short blockX = 0; blockX < mcuSize; blockX += 8)
578         {
579           // now we finally have an 8x8 block ...
580           for (unsigned short deltaY = 0; deltaY < 8; deltaY++)
581           {
582             size_t column = minimum(uint16_t(mcuX + blockX)         , maxWidth); // must not exceed image borders, replicate last row/column if needed
583             size_t row    = minimum(uint16_t(mcuY + blockY + deltaY), maxHeight);
584             for (size_t deltaX = 0; deltaX < 8; deltaX++)
585             {
586               // find actual pixel position within the current image
587               size_t pixelPos = row * int(width) + column; // the cast ensures that we don't run into multiplication overflows
588               if (column < maxWidth)
589                 column++;
590 
591               // grayscale images have solely a Y channel which can be easily derived from the input pixel by shifting it by 128
592               if (!isRGB)
593               {
594                 Y[deltaY][deltaX] = pixels[pixelPos] - 128.f;
595                 continue;
596               }
597 
598               // RGB: 3 bytes per pixel (whereas grayscale images have only 1 byte per pixel)
599               uint8_t r = pixels[3 * pixelPos    ];
600               uint8_t g = pixels[3 * pixelPos + 1];
601               uint8_t b = pixels[3 * pixelPos + 2];
602 
603               Y   [deltaY][deltaX] = rgb2y (r, g, b) - 128; // again, the JPEG standard requires Y to be shifted by 128
604               // YCbCr444 is easy - the more complex YCbCr420 has to be computed about 20 lines below in a second pass
605               if (!downsample)
606               {
607                 Cb[deltaY][deltaX] = rgb2cb(r, g, b); // standard RGB-to-YCbCr conversion
608                 Cr[deltaY][deltaX] = rgb2cr(r, g, b);
609               }
610             }
611           }
612 
613         // encode Y channel
614         lastYDC = encodeBlock(bitWriter, Y, scaledLuminance, lastYDC, huffmanLuminanceDC, huffmanLuminanceAC, codewords);
615         // Cb and Cr are encoded about 50 lines below
616       }
617 
618       // grayscale images don't need any Cb and Cr information
619       if (!isRGB)
620         continue;
621 
622       // ////////////////////////////////////////
623       // the following lines are only relevant for YCbCr420:
624       // average/downsample chrominance of four pixels while respecting the image borders
625       if (downsample)
626         for (short deltaY = 7; downsample && deltaY >= 0; deltaY--) // iterating loop in reverse increases cache read efficiency
627         {
628           size_t row      = minimum(uint16_t(mcuY + 2*deltaY), maxHeight); // each deltaX/Y step covers a 2x2 area
629           size_t column   =         mcuX;                        // column is updated inside next loop
630           size_t pixelPos = (row * int(width) + column) * 3;     // numComponents = 3
631 
632           // deltas (in bytes) to next row / column, must not exceed image borders
633           size_t rowStep    = (row    < maxHeight) ? 3 * int(width) : 0; // always numComponents*width except for bottom    line
634           size_t columnStep = (column < maxWidth ) ? 3              : 0; // always numComponents       except for rightmost pixel
635 
636           for (short deltaX = 0; deltaX < 8; deltaX++)
637           {
638             // let's add all four samples (2x2 area)
639             size_t right     = pixelPos + columnStep;
640             size_t down      = pixelPos +              rowStep;
641             size_t downRight = pixelPos + columnStep + rowStep;
642 
643             // note: cast from 8 bits to >8 bits to avoid overflows when adding
644             short r = short(pixels[pixelPos    ]) + pixels[right    ] + pixels[down    ] + pixels[downRight    ];
645             short g = short(pixels[pixelPos + 1]) + pixels[right + 1] + pixels[down + 1] + pixels[downRight + 1];
646             short b = short(pixels[pixelPos + 2]) + pixels[right + 2] + pixels[down + 2] + pixels[downRight + 2];
647 
648             // convert to Cb and Cr
649             Cb[deltaY][deltaX] = rgb2cb(r, g, b) / 4; // I still have to divide r,g,b by 4 to get their average values
650             Cr[deltaY][deltaX] = rgb2cr(r, g, b) / 4; // it's a bit faster if done AFTER CbCr conversion
651 
652             // step forward to next 2x2 area
653             pixelPos += 2*3; // 2 pixels => 6 bytes (2*numComponents)
654             column   += 2;
655 
656             // reached right border ?
657             if (column >= maxWidth)
658             {
659               columnStep = 0;
660               pixelPos = ((row + 1) * int(width) - 1) * 3; // same as (row * width + maxWidth) * numComponents => current's row last pixel
661             }
662           }
663         } // end of YCbCr420 code for Cb and Cr
664 
665       // encode Cb and Cr
666       lastCbDC = encodeBlock(bitWriter, Cb, scaledChrominance, lastCbDC, huffmanChrominanceDC, huffmanChrominanceAC, codewords);
667       lastCrDC = encodeBlock(bitWriter, Cr, scaledChrominance, lastCrDC, huffmanChrominanceDC, huffmanChrominanceAC, codewords);
668     }
669 
670   bitWriter.flush(); // now image is completely encoded, write any bits still left in the buffer
671 
672   // ///////////////////////////
673   // EOI marker
674   bitWriter << 0xFF << 0xD9; // this marker has no length, therefore I can't use addMarker()
675   return true;
676 } // writeJpeg()
677 
678 }}
679